In [1]:
!pip install anndata



In [2]:
!pip install scanpy



In [1]:
import scanpy as sc
import anndata
import importlib
from sklearn.decomposition import PCA

import matplotlib as mpl

In [2]:
import h5py
import anndata

# Read the data into an AnnData object
adata = anndata.read_h5ad('C:/Users/smattaparthi/CS-297/TabulaSapiens_Heart_Dataset.h5ad')

print(adata) 

AnnData object with n_obs × n_vars = 11505 × 58604
    obs: 'assay_ontology_term_id', 'donor_id', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ensembl_version', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: '_scvi', '_training_mode', 'assay_colors', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'de

In [5]:
#shape of data matrix
print(adata.shape)

(11505, 58604)


In [6]:
# Get the dimensions of the data 
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)

Number of Cells: 11505
Number of Genes: 58604


In [7]:
#view variable names(genes)
print(adata.var_names)

Index(['ENSG00000223972', 'ENSG00000227232', 'ENSG00000278267',
       'ENSG00000243485', 'ENSG00000284332', 'ENSG00000237613',
       'ENSG00000268020', 'ENSG00000240361', 'ENSG00000186092',
       'ENSG00000238009',
       ...
       'ENSG00000198886', 'ENSG00000210176', 'ENSG00000210184',
       'ENSG00000210191', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000210194', 'ENSG00000198727', 'ENSG00000210195',
       'ENSG00000210196'],
      dtype='object', name='ensemblid', length=58604)


In [8]:
#view observation names(cell)
print(adata.obs_names)

Index(['AAACCCAAGAGCAAGA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGATGGCGT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGGGTTAAT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTATGCAA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTCGTTAC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAAGTTGGAGC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCACAAGATTGA_TSP12_Heart_Atria_10X_1_1',
       'AAACCCACAGTGACCC_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAGTAAGCTCT_TSP12_Heart_Atria_10X_1_1',
       'AAACCCAGTCAGGTGA_TSP12_Heart_Atria_10X_1_1',
       ...
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O11_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O12_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O2_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O3_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O4_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O5_L004',
       'TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O6_L004',
  

In [3]:
#Preprocessing

#removing cells with less than 200 genes
sc.pp.filter_cells(adata, min_genes=200)

#removing genes with less than 3 cells
sc.pp.filter_genes(adata, min_cells=3)

print(adata)

AnnData object with n_obs × n_vars = 11505 × 31755
    obs: 'assay_ontology_term_id', 'donor_id', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ensembl_version', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'
    uns: '_scvi', '_training_mode', 'assay_colors', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_predi

In [4]:
#Check the different cell types present in Heart Data

cell_type = adata.obs['cell_type']

print("\n\n",cell_type,"\n\n")



 cell_id
AAACCCAAGAGCAAGA_TSP12_Heart_Atria_10X_1_1                 cardiac endothelial cell
AAACCCAAGATGGCGT_TSP12_Heart_Atria_10X_1_1                 cardiac endothelial cell
AAACCCAAGGGTTAAT_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
AAACCCAAGTATGCAA_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
AAACCCAAGTCGTTAC_TSP12_Heart_Atria_10X_1_1                      cardiac muscle cell
                                                                     ...           
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O5_L004                  native cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O6_L004     cardiac endothelial cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O8_L004                  native cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O9_L004     cardiac endothelial cell
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_P10_L004                 native cell
Name: cell_type, Length: 11505, dtype: category
Categories (6, ob

# Neural Network for Celltype

In [15]:
pip install tensorflow

Collecting tensorflowNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/93/21/9b035a4f823d6aee2917c75415be9a95861ff3d73a0a65e48edbf210cec1/tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting tensorflow-intel==2.15.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.15.0 from https://files.pythonhosted.org/packages/4c/48/1a5a15517f18eaa4ff8d598b1c000300b20c1bb0e624539d702117a0c369/tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.15.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-

In [10]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

#  target variable is  'cell_type' - to be predicted
target_var = 'cell_type'

# Extract features (gene expressions) and target to X and y variables respectively
X = adata.X  # Features (gene expressions)
y = adata.obs[target_var]  # Target variable

print("Features:\n", X)
print("\n Target:\n", y)

# Spliting data into train and test sets with train_test_split, # training data: 80%, testing data: 20% 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Convert features to float32
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

print("X_train\n\n")
print(X_train,"\n \n")

# Convert labels to integers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

print("y_train\n\n")
print(y_train,"\n \n")

Features:
   (0, 18)	2.763622
  (0, 49)	5.8476806
  (0, 54)	4.0978265
  (0, 65)	3.4415958
  (0, 74)	5.109667
  (0, 78)	7.3447404
  (0, 155)	2.2818959
  (0, 176)	2.880652
  (0, 188)	2.0861118
  (0, 204)	1.689252
  (0, 229)	5.793442
  (0, 248)	4.0624876
  (0, 270)	0.0
  (0, 281)	3.211891
  (0, 349)	6.3940516
  (0, 354)	3.1601813
  (0, 370)	4.5084615
  (0, 411)	2.6280882
  (0, 430)	3.3469772
  (0, 442)	2.4396396
  (0, 445)	2.1505191
  (0, 468)	2.1812332
  (0, 469)	2.7655473
  (0, 481)	3.2527475
  (0, 484)	4.4103656
  :	:
  (11504, 31172)	1.4387573
  (11504, 31179)	3.574275
  (11504, 31276)	1.8052568
  (11504, 31349)	2.1834857
  (11504, 31363)	2.1223752
  (11504, 31623)	3.6090944
  (11504, 31625)	1.2932464
  (11504, 31628)	2.103108
  (11504, 31639)	7.0849237
  (11504, 31673)	0.0
  (11504, 31710)	4.218026
  (11504, 31722)	2.5984704
  (11504, 31724)	4.147538
  (11504, 31726)	3.05333
  (11504, 31730)	1.8903092
  (11504, 31736)	3.2195039
  (11504, 31739)	2.9446635
  (11504, 31741)	2.74616
  (1

In [14]:
print("Shape of X")
print(X_train.shape[1])
print("\n\n",X_train[1])

print("\n\nShape of X[0]")
print(X_train.shape[0])
print("\n\n",X_train[0],"\n \n")

print("Shape of y")
print(y_train.shape)

Shape of X
31755


   (0, 18)	3.106789
  (0, 38)	2.7361465
  (0, 61)	6.8653507
  (0, 65)	2.373112
  (0, 69)	2.3965585
  (0, 80)	2.47878
  (0, 91)	4.413015
  (0, 108)	3.8094842
  (0, 113)	2.724493
  (0, 155)	2.1952238
  (0, 172)	9.35265
  (0, 230)	4.745528
  (0, 246)	3.4164739
  (0, 269)	0.0
  (0, 313)	5.64313
  (0, 349)	6.096932
  (0, 378)	2.4209769
  (0, 399)	3.1705682
  (0, 434)	2.3221972
  (0, 442)	3.0785918
  (0, 445)	2.645974
  (0, 448)	3.0708525
  (0, 455)	3.291096
  (0, 468)	2.901763
  (0, 469)	3.6790943
  :	:
  (0, 31489)	3.9680712
  (0, 31501)	3.3459125
  (0, 31524)	2.2131095
  (0, 31606)	2.3384485
  (0, 31611)	2.4343443
  (0, 31623)	2.8612812
  (0, 31628)	2.79073
  (0, 31631)	4.4204054
  (0, 31651)	7.2368093
  (0, 31670)	1.6851083
  (0, 31722)	3.497754
  (0, 31724)	4.377701
  (0, 31726)	2.4660742
  (0, 31730)	3.1230712
  (0, 31736)	4.3418174
  (0, 31739)	4.935007
  (0, 31741)	1.7485303
  (0, 31742)	4.046384
  (0, 31743)	4.1394773
  (0, 31745)	3.927436
  (0, 31746)	2.3426616
 

In [15]:
# Build neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)


model.evaluate(X_test, y_test)

# Predict test data
predictions = model.predict(X_test)

model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               4064768   
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 6)                 390       
                                                                 
Total params: 4073414 (15.54 MB)
Trainable params: 4073414 (15.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
print(predictions)

[[3.5486576e-07 9.9999416e-01 8.3667079e-10 4.0741540e-07 5.1478210e-06
  2.4225773e-08]
 [5.2907646e-14 1.0000000e+00 4.8892190e-22 5.3152849e-13 1.1235023e-16
  2.7523472e-17]
 [1.9737114e-07 9.9999917e-01 3.8139464e-11 5.3827546e-07 7.5146875e-09
  2.9584294e-08]
 ...
 [1.4173116e-09 3.2792402e-09 1.5665358e-10 1.0369133e-09 1.0000000e+00
  4.3874722e-09]
 [3.8508168e-12 1.0000000e+00 2.6779985e-22 1.0225645e-12 1.8388519e-17
  8.0159962e-17]
 [3.9318202e-15 2.1232185e-12 7.6808178e-09 1.0457083e-12 1.0000000e+00
  7.1759878e-13]]
