In [1]:
!pip install anndata



In [2]:
!pip install scanpy



In [3]:
import scanpy as sc
import anndata
import importlib
from sklearn.decomposition import PCA

import matplotlib as mpl

In [4]:
import os

import h5py
import anndata

# Location of the heart dataset (.h5ad file). The original machine-specific path
# is kept as the default so existing runs behave the same; set the
# TABULA_SAPIENS_HEART_H5AD environment variable to load the file from elsewhere.
HEART_H5AD_PATH = os.environ.get(
    'TABULA_SAPIENS_HEART_H5AD',
    'C:/Users/smattaparthi/CS-297/TabulaSapiens_Heart_Dataset.h5ad',
)

# Read the data into an AnnData object
adata = anndata.read_h5ad(HEART_H5AD_PATH)

# Print the AnnData summary (dimensions plus the obs/var/uns field names)
print(adata)

AnnData object with n_obs × n_vars = 11505 × 58604
    obs: 'assay_ontology_term_id', 'donor_id', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ensembl_version', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: '_scvi', '_training_mode', 'assay_colors', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'de

In [5]:
# Dimensions of the data matrix as a (cells, genes) tuple
print((adata.n_obs, adata.n_vars))

(11505, 58604)


In [6]:
# Report how many cells (observations) and genes (variables) the dataset holds
for label, count in (
    ("Number of Cells:", adata.n_obs),
    ("Number of Genes:", adata.n_vars),
):
    print(label, count)

Number of Cells: 11505
Number of Genes: 58604


In [7]:
# Show the gene identifiers; var_names is an alias for the index of adata.var
print(adata.var.index)

Index(['ENSG00000223972', 'ENSG00000227232', 'ENSG00000278267',
       'ENSG00000243485', 'ENSG00000284332', 'ENSG00000237613',
       'ENSG00000268020', 'ENSG00000240361', 'ENSG00000186092',
       'ENSG00000238009',
       ...
       'ENSG00000198886', 'ENSG00000210176', 'ENSG00000210184',
       'ENSG00000210191', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000210194', 'ENSG00000198727', 'ENSG00000210195',
       'ENSG00000210196'],
      dtype='object', name='ensemblid', length=58604)


In [8]:
# Show the cell identifiers; obs_names is an alias for the index of adata.obs
print(adata.obs.index)

Index(['AAACCCAAGAGCAAGA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGATGGCGT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGGGTTAAT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTATGCAA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTCGTTAC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTTGGAGC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCACAAGATTGA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCACAGTGACCC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAGTAAGCTCT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAGTCAGGTGA_TSP12_Heart_Atria_10X_1_1',
       ...
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O11_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O12_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O2_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O3_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O4_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O5_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O6_L004',
  

In [11]:
# Preprocessing: basic filtering (both calls modify adata in place)

# Drop cells in which fewer than 200 genes are detected
sc.pp.filter_cells(adata, min_genes=200, inplace=True)

# Drop genes that are detected in fewer than 3 cells
sc.pp.filter_genes(adata, min_cells=3, inplace=True)

# Print the summary again to see the dimensions after filtering
print(adata)

AnnData object with n_obs × n_vars = 11505 × 31755
    obs: 'assay_ontology_term_id', 'donor_id', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ensembl_version', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'
    uns: '_scvi', '_training_mode', 'assay_colors', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_predi

In [12]:
# Pull the per-cell 'cell_type' annotation column and print it to see which
# cell types occur in the heart data
cell_type = adata.obs.loc[:, 'cell_type']

print(cell_type)

cell_id
AAACCCAAGAGCAAGA_TSP12_Heart_Atria_10X_1_1                 cardiac endothelial cell
AAACCCAAGATGGCGT_TSP12_Heart_Atria_10X_1_1                 cardiac endothelial cell
AAACCCAAGGGTTAAT_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
AAACCCAAGTATGCAA_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
AAACCCAAGTCGTTAC_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
                                                                     ...           
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O5_L004                  native cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O6_L004     cardiac endothelial cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O8_L004                  native cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O9_L004     cardiac endothelial cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_P10_L004                 native cell
Name: cell_type, Length: 11505, dtype: category
Categories (6, objec

# Binary Classification on 'Cardiac Muscle Cell' 

## Logistic Regression 

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Boolean mask marking the cells annotated as 'cardiac muscle cell'
cardiac_muscle_cells = adata.obs['cell_type'] == 'cardiac muscle cell'

# Store the mask as a 0/1 label column (1 = cardiac muscle cell)
adata.obs['is_cardiac_muscle'] = cardiac_muscle_cells.astype(int)

# Features are the gene-expression matrix; the target is the binary label
X, y = adata.X, adata.obs['is_cardiac_muscle']

# Hold out 20% of the cells for testing (80% train); the fixed seed keeps the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the logistic regression classifier and fit it on the training cells
# (fit returns the estimator itself, so the calls can be chained)
LR_Model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the label of every held-out cell
y_pred = LR_Model.predict(X_test)

print("\n\n Predicted target: 'cardiac muscle cell: '\n\n", y_pred)




 Predicted target: 'cardiac muscle cell: '

 [1 1 1 ... 0 1 0]


In [18]:
# Score the logistic regression predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)

# Print a header followed by each metric under its own heading
print("Logistic Regression - Cardic Muscle Cells-----------------------\n")
for heading, result in (
    ("\nAccuracy:\n", accuracy),
    ("\nConfusion Matrix:\n", CM),
    ("\nClassification Report:\n", CR),
):
    print(heading, result)

Logistic Regression - Cardic Muscle Cells-----------------------


Accuracy:
 0.990438939591482

Confusion Matrix:
 [[ 853   13]
 [   9 1426]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       866
           1       0.99      0.99      0.99      1435

    accuracy                           0.99      2301
   macro avg       0.99      0.99      0.99      2301
weighted avg       0.99      0.99      0.99      2301



## Support Vector Machine

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Boolean mask marking the cells annotated as 'cardiac muscle cell'
cardiac_muscle_cells = adata.obs['cell_type'] == 'cardiac muscle cell'

# Store the mask as a 0/1 label column (1 = cardiac muscle cell)
adata.obs['is_cardiac_muscle'] = cardiac_muscle_cells.astype(int)

# Features are the gene-expression matrix; the target is the binary label
X_svm, y_svm = adata.X, adata.obs['is_cardiac_muscle']

# Hold out 20% of the cells for testing (80% train); the fixed seed keeps the
# split reproducible
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm,
    y_svm,
    test_size=0.2,
    random_state=42,
)

# Build a linear-kernel support vector classifier and fit it on the training
# cells (fit returns the estimator itself, so the calls can be chained)
SVM_Model = SVC(kernel='linear', C=1.0).fit(X_train_svm, y_train_svm)

# Predict the label of every held-out cell
y_pred_svm = SVM_Model.predict(X_test_svm)

print("\n\n Predicted target: 'cardiac muscle cell: SVM'\n\n", y_pred_svm)



 Predicted target: 'cardiac muscle cell: SVM'

 [1 1 1 ... 0 1 0]


In [20]:
# Evaluate SVM model performance on the held-out test split
accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)
CM_svm = confusion_matrix(y_test_svm, y_pred_svm)
CR_svm = classification_report(y_test_svm, y_pred_svm)

# The header must name the SVM so these metrics are not mistaken for the
# logistic regression results reported by the earlier evaluation cell.
print("Support Vector Machine - Cardic Muscle Cells-----------------------\n")
print("\nAccuracy:\n", accuracy_svm)
print("\nConfusion Matrix:\n", CM_svm)
print("\nClassification Report:\n", CR_svm)

Logistic Regression - Cardic Muscle Cells-----------------------


Accuracy:
 0.9913081269013473

Confusion Matrix:
 [[ 855   11]
 [   9 1426]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       866
           1       0.99      0.99      0.99      1435

    accuracy                           0.99      2301
   macro avg       0.99      0.99      0.99      2301
weighted avg       0.99      0.99      0.99      2301

