# -*- coding: utf-8 -*-
"""TabulaSapiens_Heart

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1x1YAuTu1IpROnthyRovb_bTaeknoctVs
"""

# Trying with a smaller dataset (around 400 MB).

!pip install anndata

import numpy as np # linear algebra
import pandas as pd
import os

import time
import matplotlib.pyplot as plt
import seaborn as sns

#1 Code to read file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# Run the Colab authentication flow for the current user.
auth.authenticate_user()
# Build a PyDrive auth object and hand it the application-default credentials
# (presumably the ones made available by the authentication call above).
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used further down to fetch the dataset file.
drive = GoogleDrive(gauth)

#Tabula Sapiens Heart Cells ------>

# https://drive.google.com/file/d/1oMOjGyf14G4dcyqrQChlD4f7TK0YV2Vf/view?usp=sharing - link to dataset

# 2. Fetch the dataset from Google Drive into the local working directory.
# The file must be uploaded to your Google Drive and shared as
# "anyone with the link can view" (Share -> Advanced -> Change).
# Replace the id below with the id of the file you want to access.
drive_file_spec = {'id': '1oMOjGyf14G4dcyqrQChlD4f7TK0YV2Vf'}
downloaded = drive.CreateFile(drive_file_spec)
downloaded.GetContentFile('TabulaSapiens_Heart_Dataset.h5ad')

# Print the Drive file object as a quick confirmation.
print(downloaded)

!pip install scanpy

import scanpy as sc
import anndata
import importlib
from sklearn.decomposition import PCA

import matplotlib as mpl

# Load the downloaded dataset into an AnnData object.
adata = sc.read_h5ad('TabulaSapiens_Heart_Dataset.h5ad')

print(adata)


# View basic statistics of the data
print(adata.var_names[:10])  # View the first 10 gene names

print(adata.obs.head())     # View the first few rows of the observation (cell) data

# Plot the distribution of the gene expression levels.
# AnnData objects have no `.plot` accessor, so the histogram is drawn with
# matplotlib from the per-gene mean of the expression matrix. `adata.X` may be
# a dense array or a scipy sparse matrix; np.asarray(...).ravel() flattens the
# result of either to a 1-D array.
mean_expression_per_gene = np.asarray(adata.X.mean(axis=0)).ravel()
plt.figure()
plt.hist(mean_expression_per_gene, bins=100, density=True)
plt.xlabel("Mean expression per gene")
plt.ylabel("Density")
plt.title("Distribution of gene expression levels")
plt.show()

# Get the dimensions of the data
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


"""Dimensionality Reduction (e.g., PCA):"""

# Compute the PCA representation, then colour the PCA scatter plot by the
# expression of a handful of genes (given by Ensembl ID).
sc.tl.pca(adata)
sc.pl.pca(
    adata,
    color=[
        'ENSG00000223972',
        'ENSG00000227232',
        'ENSG00000278267',
        'ENSG00000243485',
        'ENSG00000284332',
        'ENSG00000237613',
        'ENSG00000268020',
        'ENSG00000240361',
        'ENSG00000186092',
        'ENSG00000238009',
    ],
)

# UMAP embedding
# sc.pl.umap reads its coordinates from adata.obsm['X_umap']. Nothing above
# computes them, so build the neighbour graph and the embedding when the
# loaded file does not already provide one; an existing embedding is kept.
if 'X_umap' not in adata.obsm:
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
sc.pl.umap(adata, color='cell_type')

# PCA plot
sc.pl.pca(adata, color='cell_type')