genome_population_projection.ipynb
Visualize your genotypes in 1000 Genome Project among the population structure.
Please log in to comment. "This notebook really helped me transform my genotype data into reduced dimensions."
This notebook will show you how to transform your genotype data into reduced dimensions using the 1kGP as reference populations.
Dimensionality Reduction techniques include PCA, t-SNE, and UMAP.
If you don't want to click through each cell of this notebook and you only care about the cool visualizations, just select Run All from Cell in the menu bar above.
Also, check out the interactive visualization app.
The reference .vcf
has been created and steps documented by subsetting 1kGP sample genotypes to ancestry informative SNPs (AISNPs).
0 - ref/ref
1 - ref/alt
2 - alt/alt
3 - Unknown
{PCA, TSNE, or UMAP}
This notebook will show you how to transform your genotype data into reduced dimensions using the 1kGP as reference populations.
Dimensionality Reduction techniques include PCA, t-SNE, and UMAP.
If you don't want to click through each cell of this notebook and you only care about the cool visualizations, just select Run All from Cell in the menu bar above.
Also, check out the interactive visualization app.
# stdlib
import bz2
import gzip
import io
import logging
import os
from io import StringIO

# third-party
import numpy as np
import pandas as pd
import pysam
import requests
import umap
import vcf
from MulticoreTSNE import MulticoreTSNE as TSNE
from ohapi import api
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE  # NOTE: re-binds TSNE, shadowing MulticoreTSNE above
import warnings
warnings.filterwarnings('ignore')
# plotting
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
%matplotlib inline
The reference .vcf
has been created and steps documented by subsetting 1kGP sample genotypes to ancestry informative SNPs (AISNPs).
def vcf2df(vcf_fname):
    """Download a subsetted 1kGP VCF and convert it to a pandas DataFrame.

    :param vcf_fname: HTTPS URL of the AISNP-subsetted 1kGP .vcf file.
    :returns: (df, dfsamples) where df is indexed by sample id with one
        column per variant ID holding pyvcf gt_type codes (0/1/2, or 3 for
        unknown), and dfsamples holds per-sample population annotations.
    """
    # Per-sample population panel shipped with the 1000 Genomes release.
    samples = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel'
    dfsamples = pd.read_csv(samples, sep='\t')
    dfsamples.set_index('sample', inplace=True)
    # The panel file has trailing tabs that pandas reads as empty columns.
    dfsamples.drop(['Unnamed: 4', 'Unnamed: 5'], inplace=True, axis=1)
    vcf_file = requests.get(vcf_fname)
    vcf_file.raise_for_status()  # fail fast instead of parsing an error page
    buffered = StringIO(vcf_file.content.decode("utf-8"))
    # Fix: the original passed a stray 'rb' positional arg, which pyvcf bound
    # to its `filename` parameter and silently ignored (fsock takes priority).
    vcf_reader = vcf.Reader(buffered)
    df = pd.DataFrame(index=vcf_reader.samples)
    for variant in vcf_reader:
        # 3 encodes an unknown genotype, matching pyvcf's None gt_type.
        df[variant.ID] = [call.gt_type if call.gt_type is not None else 3 for call in variant.samples]
    df = df.join(dfsamples, how='outer')
    df = df.drop(['pop', 'super_pop', 'gender'], axis=1)
    return df, dfsamples
# Choose the AISNP panel: 55 (Kidd et al.) or 128 (Kosoy et al.).
aisnps = 55
panel_vcfs = {
    55: 'https://s3.amazonaws.com/ancestry-snps-1kgp/Kidd.55AISNP.1kG.vcf',
    128: 'https://s3.amazonaws.com/ancestry-snps-1kgp/Seldin.128AISNP.1kG.vcf',
}
if aisnps in panel_vcfs:
    vcf_fname = panel_vcfs[aisnps]
df, dfsamples = vcf2df(vcf_fname)
0 - ref/ref
1 - ref/alt
2 - alt/alt
3 - Unknown
# Peek at the genotype matrix: one row per 1kGP sample, one column per AISNP.
df.head(3)
# Peek at the per-sample population annotations.
dfsamples.head(3)
# Authenticate with Open Humans using the OAuth2 token from the environment.
user = api.exchange_oauth2_member(os.environ.get('OH_ACCESS_TOKEN'))
# 128 - 23andMe Upload
# 129 - AncestryDNA Upload
# 120 - FamilyTreeDNA Upload
# 40 - Gencove Upload
# 131 - Genome/Exome Upload
# uncomment the project_source below depending on which you would like to use
project_source = 'direct-sharing-128'
#project_source = 'direct-sharing-129'
#project_source = 'direct-sharing-120'
#project_source = 'direct-sharing-131'
#project_source = 'direct-sharing-40'
# Get a .vcf from Open Humans.  The payload may be bz2-compressed,
# gzip-compressed, or plain text, so try each in turn.
for record in user['data']:
    if record['source'] == project_source and '.vcf' in record['basename']:
        print('Found your genotype data in Open Humans!')
        datafile = requests.get(record['download_url'])
        with open('member.vcf', 'wb') as handle:
            try:
                try:
                    # bz2.decompress raises OSError on a non-bz2 stream.
                    handle.write(bz2.decompress(datafile.content))
                except OSError:
                    # Fix: `gzip` was used here without ever being imported,
                    # so this fallback previously raised NameError.
                    handle.write(gzip.decompress(datafile.content))
            except OSError:
                # Neither bz2 nor gzip: assume plain text and stream it.
                for block in datafile.iter_content(1024):
                    handle.write(block)
            except Exception:
                # Fix: was a bare `except:` calling an undefined `logger`.
                logging.getLogger(__name__).critical('your data source file is malformed')
        break
# Seed the user's row entirely with the pyvcf UNKNOWN genotype code (3);
# actual genotypes are filled in from their VCF below.
df.loc['me'] = np.full(df.shape[1], 3)
dfsamples.loc['me'] = ['me', 'me', 'male']
# read the raw AISNPs tables from the manuscripts, w/ genomic location and rsid
# (Kidd 55-SNP panel or Seldin 128-SNP panel, matching the vcf chosen above)
if aisnps == 55:
    dfaim = pd.read_csv('https://s3.amazonaws.com/ancestry-snps-1kgp/Kidd_55_AISNPs.txt', sep='\t')
elif aisnps == 128:
    dfaim = pd.read_csv('https://s3.amazonaws.com/ancestry-snps-1kgp/Seldin_128_AISNPs.txt', sep='\t')
%%bash
# block-gzip and tabix-index member.vcf so pyvcf can random-access it by
# chromosome/position in the fetch() calls below
pbgzip member.vcf
tabix member.vcf.gz
# Open the indexed member VCF and overwrite the UNKNOWN placeholders in the
# 'me' row with the genotype at each AISNP position present in the file.
vcf_reader = vcf.Reader(filename='member.vcf.gz')
for _, aim in dfaim.iterrows():
    contig = str(aim['Chr'])
    position = int(aim['Build 37 nt position'].replace(',', ''))
    rsid = aim['dbSNP rs#']
    # fetch() takes a 0-based half-open interval; only one sample per record.
    for rec in vcf_reader.fetch(contig, position - 1, position):
        gt = rec.samples[0].gt_type
        df.loc['me', rsid] = gt if gt is not None else 3
# Confirm the 'me' row was appended at the bottom of the matrix.
df.tail(3)
# One-hot encode every genotype column over the four possible codes {0,1,2,3}.
ncols = len(df.columns)
ohe = OneHotEncoder(sparse=False, categories=[range(4)] * ncols)
X = ohe.fit_transform(df.values)
def reduce_dim(X, algorithm='PCA', n_components=4):
    """Reduce the dimensionality of the one-hot encoded AISNP genotypes.

    :param X: One-hot encoded 1kG AISNPs.
    :type X: array
    :param algorithm: The type of dimensionality reduction to perform.
        One of {PCA, UMAP, TSNE}
    :type algorithm: str
    :param n_components: The number of components to return in X_red
    :type n_components: int
    :returns: The transformed X[m, n] array, reduced to X[m, n_components]
        by algorithm, or None for an unknown algorithm or an unsupported
        TSNE dimensionality.
    """
    if algorithm == 'PCA':
        return PCA(n_components=n_components).fit_transform(X)
    if algorithm == 'TSNE':
        # The Barnes-Hut approximation behind TSNE only supports <= 3 dims.
        if n_components > 3:
            # Fix: message previously misspelled "dimensionaility".
            print('The Barnes-Hut method requires the dimensionality to be <= 3')
            return None
        return TSNE(n_components=n_components, n_jobs=4).fit_transform(X)
    if algorithm == 'UMAP':
        return umap.UMAP(n_components=n_components).fit_transform(X)
    return None
{PCA, TSNE, or UMAP}
X_emb = reduce_dim(X, algorithm='PCA', n_components=3)
def encode_class(pop_level='pop'):
    """Encode the population labels for plotting.

    :param pop_level: which dfsamples column to encode; 'pop' or 'super_pop'.
    :returns: (fitted LabelEncoder, encoded integer label per sample), or
        None for an unrecognized pop_level.
    """
    if pop_level not in ('pop', 'super_pop'):
        return None
    encoder = LabelEncoder()
    return encoder, encoder.fit_transform(dfsamples[pop_level].values)
# pop_level selects which sample annotation column to color by:
# 'pop' (fine-grained) or 'super_pop' (coarser grouping)
le, labels = encode_class(pop_level='super_pop')
def plot_samples(X_emb, x_component=1, y_component=2):
    """Scatter-plot two components of the embedding, colored by population.

    Reads the module-level ``le`` (fitted LabelEncoder) and ``labels``
    (encoded class per sample); the 'me'/'me_imputed' samples are drawn
    with a much larger marker so they stand out.

    :param X_emb: [m, k] dimensionality-reduced sample coordinates.
    :param x_component: 1-based component index for the x-axis.  Fix: the
        previous default of None raised TypeError when omitted.
    :param y_component: 1-based component index for the y-axis (same fix).
    """
    unique = np.unique(labels)
    colors = [plt.cm.tab10_r(i / float(len(unique) - 1)) for i in range(len(unique))]
    assignments = [colors[i] for i in labels]
    plt.figure(figsize=(10, 10))
    for cla in set(labels):
        name = le.inverse_transform([cla])[0]
        # Emphasize the user's own (and imputed) sample.
        s = 500 if name in ('me', 'me_imputed') else None
        xc = [p for j, p in enumerate(X_emb[:, x_component - 1]) if labels[j] == cla]
        yc = [p for j, p in enumerate(X_emb[:, y_component - 1]) if labels[j] == cla]
        cols = [c for j, c in enumerate(assignments) if labels[j] == cla]
        plt.scatter(xc, yc, s=s, c=cols, label=name)
    plt.legend()
    plt.xlabel('Component {}'.format(x_component))
    plt.ylabel('Component {}'.format(y_component))
    plt.title('Projection of 1000 Genomes Samples\ninto Lower Dimensional Space')
plot_samples(X_emb, x_component=1, y_component=2)
# Fetch your imputed genotypes from the Imputer project -- the download
# might take a minute.
for rec in user['data']:
    if rec['basename'] == 'member.imputed.vcf.bz2':
        print('Found your imputed genotype data in Open Humans!')
        with open('member.imputed.vcf', 'wb') as out:
            payload = requests.get(rec['download_url']).content
            out.write(bz2.decompress(payload))
        break
%%bash
# block-gzip and tabix-index the imputed VCF for random access, as above
pbgzip member.imputed.vcf
tabix member.imputed.vcf.gz
# Genotype-code counts for the un-imputed sample row (3 == UNKNOWN).
me_gts = df.loc['me'].value_counts()
# Seed the imputed row with UNKNOWN everywhere, then overwrite below.
df.loc['me_imputed'] = np.repeat([3], df.shape[1])
dfsamples.loc['me_imputed'] = ['me_imputed', 'me_imputed', 'male']
myvcf = vcf.Reader(filename='member.imputed.vcf.gz')
for _, aim in dfaim.iterrows():
    contig = str(aim['Chr'])
    position = int(aim['Build 37 nt position'].replace(',', ''))
    rsid = aim['dbSNP rs#']
    # 0-based half-open fetch window around the AISNP position.
    for rec in myvcf.fetch(contig, position - 1, position):
        gt = rec.samples[0].gt_type
        df.loc['me_imputed', rsid] = gt if gt is not None else 3
# Re-encode (now including 'me' and 'me_imputed') and re-embed with PCA.
ncols = len(df.columns)
ohe = OneHotEncoder(sparse=False, categories=[range(4)] * ncols)
X = ohe.fit_transform(df.values)
X_emb = reduce_dim(X, algorithm='PCA', n_components=3)
le, labels = encode_class(pop_level='super_pop')
plot_samples(X_emb, x_component=1, y_component=2)
# Genotype-code counts for the imputed sample row.
me_gts_imputed = df.loc['me_imputed'].value_counts()
# The count of 3's is the number of missing genotypes in your un-imputed vcf
# (NOTE: the original comment said 2's, but 2 means alt/alt; 3 is Unknown)
me_gts
# The count of 3's is the number of missing genotypes in your imputed vcf
me_gts_imputed