#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install anndata')


# In[2]:


get_ipython().system('pip install scanpy')


# In[3]:


import scanpy as sc
import anndata
import importlib
from sklearn.decomposition import PCA
import matplotlib as mpl


# In[5]:


import h5py
import anndata

# Read the data into an AnnData object
adata = anndata.read_h5ad('C:/Users/smattaparthi/CS-297/TabulaSapiens_Heart_Dataset.h5ad')
print(adata)


# In[6]:


# Shape of the data matrix
print(adata.shape)


# In[7]:


# Get the dimensions of the data
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


# In[8]:


# View variable names (genes)
print(adata.var_names)


# In[9]:


# View observation names (cells)
print(adata.obs_names)


# In[10]:


# View the first few rows of the data matrix
print("First 5 rows:\n", adata.X[:5])


# In[11]:


# Preprocessing


# In[13]:


# Remove cells expressing fewer than 200 genes
sc.pp.filter_cells(adata, min_genes=200)


# In[14]:


# Remove genes detected in fewer than 3 cells
sc.pp.filter_genes(adata, min_cells=3)


# In[15]:


print(adata)


# In[16]:


# Dimensions of the data after preprocessing
print("Number of Cells:", adata.n_obs)
print("Number of Genes:", adata.n_vars)


# # Classification

# ## Logistic Regression

# In[21]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Target variable is 'cell_type' - to be predicted
target_var = 'cell_type'

# Extract features (gene expressions) and target into X and y respectively
X = adata.X                  # Features (gene expressions)
y = adata.obs[target_var]    # Target variable

print("Features:\n", X)
print("\n Target:\n", y)

# Split data into train and test sets with train_test_split,
# training data: 80%, testing data: 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate logistic regression model
LR_Model = LogisticRegression(max_iter=1000)

# Train the model
LR_Model.fit(X_train, y_train)

# Predict test data
y_pred = LR_Model.predict(X_test)
print("\n\n Predicted target: cell_type\n\n", y_pred)


# In[23]:


# Model Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
CR = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", CR)


# In[ ]:





# ## Support Vector Machine (SVM)

# In[24]:


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Target variable is 'cell_type' - to be predicted
target_var_svm = 'cell_type'

# Extract features (gene expressions) and target into X and y respectively
X_svm = adata.X                      # Features (gene expressions)
y_svm = adata.obs[target_var_svm]    # Target variable

print("Features:\n", X_svm)
print("\n Target:\n\n", y_svm)

# Split data into train and test sets with train_test_split,
# training data: 80%, testing data: 20%
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_svm, y_svm, test_size=0.2, random_state=42)

# Instantiate Support Vector Machine model
SVM_Model = SVC(kernel='linear', C=1.0)

# Train the model
SVM_Model.fit(X_train_svm, y_train_svm)

# Predict test data
y_pred_svm = SVM_Model.predict(X_test_svm)
print("\n\n Predicted target with SVM: cell_type\n\n", y_pred_svm)


# In[25]:


# Model Accuracy and Classification Report to measure performance
accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)
CR_svm = classification_report(y_test_svm, y_pred_svm)
print("Accuracy SVM:", accuracy_svm)
print("Classification Report SVM:\n", CR_svm)
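

# Cell-type labels in this dataset are typically imbalanced, so a single 80/20 split can give a
# noisy or optimistic accuracy estimate. The cell below is an illustrative sketch (not part of the
# original analysis) of one way to get a steadier estimate with stratified 5-fold cross-validation.
# It assumes the X (gene expressions) and y ('cell_type') variables defined in the logistic
# regression cell above; the fold count of 5 is an arbitrary choice.

# In[ ]:


# Hedged sketch: stratified 5-fold cross-validation of the logistic regression classifier.
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

cv_model = LogisticRegression(max_iter=1000)

# Each fold preserves the overall cell-type proportions of the full dataset
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy on each of the 5 held-out folds
cv_scores = cross_val_score(cv_model, X, y, cv=skf, scoring='accuracy')
print("Per-fold accuracy:", cv_scores)
print("Mean accuracy: %.3f (+/- %.3f)" % (cv_scores.mean(), cv_scores.std()))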
# In[ ]:





# # Clustering

# ## K-Means on Genes (X)

# In[ ]:





# In[28]:


from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Number of clusters - value of K
clusters_k = 6
print("Number of Clusters:", clusters_k, "\n\n")

# Initialize K-Means model
K_Means = KMeans(n_clusters=clusters_k, random_state=0)

# Train the model on the full gene-expression matrix
K_Means.fit(adata.X)

# Extract cluster labels for each cell
c_labels = K_Means.labels_
print("\n \n Cluster Labels:\n\n", c_labels)

# Add cluster labels to the AnnData object (as categorical, so scanpy colours the clusters discretely)
adata.obs['kmeans_clusters'] = pd.Categorical(c_labels.astype(str))

# View clusters using PCA
# (plots the PCA embedding stored in adata.obsm['X_pca'])
sc.pl.pca(adata, color=['kmeans_clusters'], title="K-Means Clustering of Heart Data")


# In[29]:


# Plot cluster centers (centroids) in the first two gene dimensions
centroids = K_Means.cluster_centers_
print("Centroids: ", centroids, "\n \n")

plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='green', label='Cluster Centers')
plt.legend()
plt.title("K-Means Clusters with Cluster Centers (Centroids)")
plt.show()


# ## K-Means on Principal Components of Genes (X_pca)

# In[30]:


from sklearn.cluster import KMeans

# Number of clusters - value of K
clusters_pca = 6
print("Number of Clusters:", clusters_pca, "\n\n")

# Extract the first 10 principal components
X_pca = adata.obsm['X_pca'][:, :10]

# Initialize K-Means model
K_Means_pca = KMeans(n_clusters=clusters_pca, random_state=0)

# Train the model and get cluster labels
c_labels_pca = K_Means_pca.fit_predict(X_pca)
print("Cluster Labels: ", c_labels_pca, "\n \n")

# Add cluster labels to the AnnData object
adata.obs['kmeans_clusters_pca'] = pd.Categorical(c_labels_pca.astype(str))

# Visualize clusters using UMAP
# (plots the UMAP embedding stored in adata.obsm['X_umap'])
sc.pl.umap(adata, color='kmeans_clusters_pca')


# In[ ]:





# ## Hierarchical Clustering

# In[33]:


from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the linkage matrix after converting the sparse csr_matrix to a dense array
# Ward linkage minimizes the variance within merged clusters
linkage_matrix = linkage(adata.X.toarray(), method='ward')

# Plot the dendrogram (truncated to the top 5 levels for readability)
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix, p=5, truncate_mode='level', orientation='top', show_leaf_counts=True)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Cells")
plt.ylabel("Distance")
plt.show()

# Determine the clusters by cutting the dendrogram at a distance threshold
max_dist = 2000  # Threshold on the cophenetic distance
clusters_h = fcluster(linkage_matrix, max_dist, criterion='distance')
num_clusters = len(set(clusters_h))
print("Number of clusters at distance threshold", max_dist, ":", num_clusters)

# Add cluster labels to the AnnData object
adata.obs['hierarchical_clusters'] = pd.Categorical(clusters_h.astype(str))

# Visualize the clusters (for example, using PCA)
sc.pl.pca(adata, color=['hierarchical_clusters'], title="Hierarchical Clustering for Heart Data")


# In[ ]:
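



# The notebook produces several cluster assignments (K-Means on genes, K-Means on PCs, hierarchical)
# but does not compare them with the annotated cell types. The cell below is an illustrative sketch
# (not part of the original analysis) that quantifies this agreement with the adjusted Rand index
# (ARI). It assumes the 'cell_type' annotation and the cluster columns created above are present in
# adata.obs.

# In[ ]:


# Hedged sketch: agreement between each clustering and the annotated cell types.
# ARI = 1 means identical partitions; values near 0 mean agreement no better than chance.
from sklearn.metrics import adjusted_rand_score

cell_types = adata.obs['cell_type']
for col in ['kmeans_clusters', 'kmeans_clusters_pca', 'hierarchical_clusters']:
    ari = adjusted_rand_score(cell_types, adata.obs[col])
    print(f"ARI between {col} and cell_type: {ari:.3f}")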