#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install anndata')


# In[2]:


get_ipython().system('pip install scanpy')


# In[3]:


import scanpy as sc
import anndata
import importlib
from sklearn.decomposition import PCA
import matplotlib as mpl


# In[5]:


import h5py
import anndata

# Read the data into an AnnData object
adata = anndata.read_h5ad('C:/Users/smattaparthi/CS-297/TabulaSapiens_Heart_Dataset.h5ad')
print(adata)


# In[6]:


# Shape of the data matrix
print(adata.shape)


# In[7]:


# Get the dimensions of the data
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


# In[8]:


# View variable names (genes)
print(adata.var_names)


# In[9]:


# View observation names (cells)
print(adata.obs_names)


# In[10]:


# View the first few rows of the data matrix
print("First 5 rows:\n", adata.X[:5])


# In[11]:


# Preprocessing


# In[13]:


# Remove cells expressing fewer than 200 genes
sc.pp.filter_cells(adata, min_genes=200)


# In[14]:


# Remove genes detected in fewer than 3 cells
sc.pp.filter_genes(adata, min_cells=3)


# In[15]:


print(adata)


# In[16]:


# Dimensions of the data after preprocessing
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


# # Classification

# ## Logistic Regression

# In[21]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Target variable is 'cell_type' - to be predicted
target_var = 'cell_type'

# Extract features (gene expressions) and target into X and y respectively
X = adata.X                  # Features (gene expressions)
y = adata.obs[target_var]    # Target variable

print("Features:\n", X)
print("\n Target:\n", y)

# Split data into train and test sets with train_test_split,
# training data: 80%, testing data: 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate logistic regression model
LR_Model = LogisticRegression(max_iter=1000)

# Train the model
LR_Model.fit(X_train, y_train)

# Predict test data
y_pred = LR_Model.predict(X_test)
print("\n\n Predicted target: cell_type\n\n", y_pred)


# In[23]:


# Model Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
CR = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", CR)


# In[ ]:





# ## Support Vector Machine (SVM)

# In[24]:


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Target variable is 'cell_type' - to be predicted
target_var_svm = 'cell_type'

# Extract features (gene expressions) and target into X and y respectively
X_svm = adata.X                      # Features (gene expressions)
y_svm = adata.obs[target_var_svm]    # Target variable

print("Features:\n", X_svm)
print("\n Target:\n\n", y_svm)

# Split data into train and test sets with train_test_split,
# training data: 80%, testing data: 20%
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_svm, y_svm, test_size=0.2, random_state=42)

# Instantiate Support Vector Machine model
SVM_Model = SVC(kernel='linear', C=1.0)

# Train the model
SVM_Model.fit(X_train_svm, y_train_svm)

# Predict test data
y_pred_svm = SVM_Model.predict(X_test_svm)
print("\n\n Predicted target with SVM: cell_type\n\n", y_pred_svm)


# In[25]:


# Model Accuracy and Classification Report to measure performance
accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)
CR_svm = classification_report(y_test_svm, y_pred_svm)
print("Accuracy SVM:", accuracy_svm)
print("Classification Report SVM:\n", CR_svm)
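

# Cell-type labels in this dataset are typically imbalanced, so a single 80/20 split can give a
# noisy or optimistic accuracy estimate. The cell below is an illustrative sketch (not part of the
# original analysis) of one way to get a steadier estimate with stratified 5-fold cross-validation.
# It assumes the X (gene expressions) and y ('cell_type') variables defined in the logistic
# regression cell above; the fold count of 5 is an arbitrary choice.

# In[ ]:


# Hedged sketch: stratified 5-fold cross-validation of the logistic regression classifier.
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

cv_model = LogisticRegression(max_iter=1000)

# Each fold preserves the overall cell-type proportions of the full dataset
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy on each of the 5 held-out folds
cv_scores = cross_val_score(cv_model, X, y, cv=skf, scoring='accuracy')
print("Per-fold accuracy:", cv_scores)
print("Mean accuracy: %.3f (+/- %.3f)" % (cv_scores.mean(), cv_scores.std()))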
# In[ ]:





# # Clustering

# ## K-Means on Genes (X)

# In[ ]:





# In[28]:


from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Number of clusters - value of K
clusters_k = 6
print("Number of Clusters:", clusters_k, "\n\n")

# Initialize K-Means model
K_Means = KMeans(n_clusters=clusters_k, random_state=0)

# Train the model on the full gene-expression matrix
K_Means.fit(adata.X)

# Extract cluster labels for each cell
c_labels = K_Means.labels_
print("\n \n Cluster Labels:\n\n", c_labels)

# Add cluster labels to the AnnData object (as categorical, so scanpy colours the clusters discretely)
adata.obs['kmeans_clusters'] = pd.Categorical(c_labels.astype(str))

# View clusters using PCA
# (plots the PCA embedding stored in adata.obsm['X_pca'])
sc.pl.pca(adata, color=['kmeans_clusters'], title="K-Means Clustering of Heart Data")


# In[29]:


# Plot cluster centers (centroids) in the first two gene dimensions
centroids = K_Means.cluster_centers_
print("Centroids: ", centroids, "\n \n")

plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='green', label='Cluster Centers')
plt.legend()
plt.title("K-Means Clusters with Cluster Centers (Centroids)")
plt.show()


# ## K-Means on Principal Components of Genes (X_pca)

# In[30]:


from sklearn.cluster import KMeans

# Number of clusters - value of K
clusters_pca = 6
print("Number of Clusters:", clusters_pca, "\n\n")

# Extract the first 10 principal components
X_pca = adata.obsm['X_pca'][:, :10]

# Initialize K-Means model
K_Means_pca = KMeans(n_clusters=clusters_pca, random_state=0)

# Train the model and get cluster labels
c_labels_pca = K_Means_pca.fit_predict(X_pca)
print("Cluster Labels: ", c_labels_pca, "\n \n")

# Add cluster labels to the AnnData object
adata.obs['kmeans_clusters_pca'] = pd.Categorical(c_labels_pca.astype(str))

# Visualize clusters using UMAP
# (plots the UMAP embedding stored in adata.obsm['X_umap'])
sc.pl.umap(adata, color='kmeans_clusters_pca')


# In[ ]:





# ## Hierarchical Clustering

# In[33]:


from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the linkage matrix after converting the sparse csr_matrix to a dense array
# Ward linkage minimizes the variance within merged clusters
linkage_matrix = linkage(adata.X.toarray(), method='ward')

# Plot the dendrogram (truncated to the top 5 levels for readability)
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix, p=5, truncate_mode='level', orientation='top', show_leaf_counts=True)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Cells")
plt.ylabel("Distance")
plt.show()

# Determine the clusters by cutting the dendrogram at a distance threshold
max_dist = 2000  # Threshold on the cophenetic distance
clusters_h = fcluster(linkage_matrix, max_dist, criterion='distance')
num_clusters = len(set(clusters_h))
print("Number of clusters at distance threshold", max_dist, ":", num_clusters)

# Add cluster labels to the AnnData object
adata.obs['hierarchical_clusters'] = pd.Categorical(clusters_h.astype(str))

# Visualize the clusters (for example, using PCA)
sc.pl.pca(adata, color=['hierarchical_clusters'], title="Hierarchical Clustering for Heart Data")


# In[ ]:
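



# The notebook produces several cluster assignments (K-Means on genes, K-Means on PCs, hierarchical)
# but does not compare them with the annotated cell types. The cell below is an illustrative sketch
# (not part of the original analysis) that quantifies this agreement with the adjusted Rand index
# (ARI). It assumes the 'cell_type' annotation and the cluster columns created above are present in
# adata.obs.

# In[ ]:


# Hedged sketch: agreement between each clustering and the annotated cell types.
# ARI = 1 means identical partitions; values near 0 mean agreement no better than chance.
from sklearn.metrics import adjusted_rand_score

cell_types = adata.obs['cell_type']
for col in ['kmeans_clusters', 'kmeans_clusters_pca', 'hierarchical_clusters']:
    ari = adjusted_rand_score(cell_types, adata.obs[col])
    print(f"ARI between {col} and cell_type: {ari:.3f}")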