#!/usr/bin/env python
# coding: utf-8
"""Binary classification of cardiac muscle cells in a heart single-cell dataset.

Notebook export.  The script loads an AnnData ``.h5ad`` file, filters cells and
genes with scanpy, derives a binary ``is_cardiac_muscle`` label from
``adata.obs['cell_type']`` and then trains and evaluates two classifiers on the
expression matrix ``adata.X``: a logistic regression and a linear SVM.

All intermediate results are kept as module-level variables so that later
notebook cells can keep using them.
"""

# In[1]: / In[2]:
# The ``pip install`` shell escapes only work inside IPython/Jupyter, where
# ``get_ipython`` is injected as a builtin.  Under plain ``python`` the name is
# undefined, so skip the installs there (the packages must already be present)
# instead of crashing with a NameError before any work is done.
try:
    _ipython = get_ipython()  # noqa: F821 - provided by IPython at runtime
except NameError:
    _ipython = None

if _ipython is not None:
    _ipython.system('pip install anndata')
    _ipython.system('pip install scanpy')


# In[3]:
import importlib

import anndata
import h5py
import matplotlib as mpl
import scanpy as sc
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Location of the input dataset; edit this single constant to run elsewhere.
DATA_PATH = 'C:/Users/smattaparthi/CS-297/TabulaSapiens_Heart_Dataset.h5ad'

# Value of ``adata.obs['cell_type']`` that defines the positive class.
TARGET_CELL_TYPE = 'cardiac muscle cell'


def _evaluate(model_name, y_true, y_hat):
    """Print and return the evaluation metrics of one classifier.

    Sharing this helper between both models keeps the printed banner in sync
    with the model that is actually being evaluated.

    Args:
        model_name: Human-readable model name used in the printed banner.
        y_true: Ground-truth labels of the test samples.
        y_hat: Labels predicted by the model for the same samples.

    Returns:
        Tuple ``(accuracy, confusion_matrix, classification_report)`` as
        returned by the corresponding scikit-learn metric functions.
    """
    acc = accuracy_score(y_true, y_hat)
    cm = confusion_matrix(y_true, y_hat)
    report = classification_report(y_true, y_hat)

    print(f"{model_name} - Cardic Muscle Cells-----------------------\n")
    print("\nAccuracy:\n", acc)
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", report)
    return acc, cm, report


# In[4]:
# Read the data into an AnnData object
adata = anndata.read_h5ad(DATA_PATH)
print(adata)


# In[5]:
# Shape of the data matrix
print(adata.shape)


# In[6]:
# Dimensions of the data
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


# In[7]:
# Variable names (genes)
print(adata.var_names)


# In[8]:
# Observation names (cells)
print(adata.obs_names)


# In[11]:
# Preprocessing
# Remove cells with fewer than 200 genes
sc.pp.filter_cells(adata, min_genes=200)
# Remove genes found in fewer than 3 cells
sc.pp.filter_genes(adata, min_cells=3)
print(adata)


# In[12]:
# Check the different cell types present in the heart data
cell_type = adata.obs['cell_type']
print(cell_type)


# In[ ]:
# # Binary Classification on 'Cardiac Muscle Cell'

# In[ ]:
# ## Logistic Regression

# In[17]:
# Identify cardiac muscle cells in the dataset
cardiac_muscle_cells = adata.obs['cell_type'] == TARGET_CELL_TYPE

# Create a binary label for cardiac muscle cells (1 = cardiac muscle, 0 = other)
adata.obs['is_cardiac_muscle'] = cardiac_muscle_cells.astype(int)

# Extract features (gene expressions) and label into X and y respectively
X = adata.X
y = adata.obs['is_cardiac_muscle']

# Split into train and test sets: 80% training, 20% testing.
# NOTE(review): the split is not stratified; if the positive class turns out to
# be rare, consider passing ``stratify=y`` - confirm against the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Instantiate the logistic regression model
LR_Model = LogisticRegression(max_iter=1000)

# Train the model
LR_Model.fit(X_train, y_train)

# Predict cardiac muscle cells on the test data
y_pred = LR_Model.predict(X_test)
print("\n\n Predicted target: 'cardiac muscle cell: '\n\n", y_pred)


# In[18]:
# Evaluate logistic regression model performance
accuracy, CM, CR = _evaluate("Logistic Regression", y_test, y_pred)


# In[ ]:
# ## Support Vector Machine

# In[19]:
# The label column computed for the logistic regression is reused unchanged;
# the features and labels are the same objects the first model was trained on.
X_svm = adata.X
y_svm = adata.obs['is_cardiac_muscle']

# Same test_size and random_state as above, so both models are scored on the
# same 80/20 partition.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y_svm, test_size=0.2, random_state=42
)

# Instantiate the Support Vector Machine model
SVM_Model = SVC(kernel='linear', C=1.0)

# Train the model
SVM_Model.fit(X_train_svm, y_train_svm)

# Predict cardiac muscle cells on the test data
y_pred_svm = SVM_Model.predict(X_test_svm)
print("\n\n Predicted target: 'cardiac muscle cell: SVM'\n\n", y_pred_svm)


# In[20]:
# Evaluate SVM model performance (the banner previously said
# "Logistic Regression" here, which mislabelled the SVM results)
accuracy_svm, CM_svm, CR_svm = _evaluate(
    "Support Vector Machine", y_test_svm, y_pred_svm
)


# In[ ]: