import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Load Dataset
# error_bad_lines was removed in pandas 2.0; on_bad_lines='skip' drops malformed rows instead
data = pd.read_csv('data/news-date-text.csv', on_bad_lines='skip')
documents_list = data['headline_text'].tolist()
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')
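As a quick illustration (not part of the original walkthrough), the regex tokenizer simply keeps runs of word characters and drops punctuation:
# Example: apostrophes split words, so possessives leave a stray "s" token
print(tokenizer.tokenize("Australia's world cup win!"))   # ['Australia', 's', 'world', 'cup', 'win']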
# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize)
# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)
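As a sanity check (an optional step, not in the original code), you can inspect the shape of the document-term matrix and the size of the learned vocabulary:
# train_data is a sparse matrix of shape (n_documents, n_terms)
print(train_data.shape)
print(len(tfidf.vocabulary_))   # number of unique terms kept by the vectorizer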
UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None
# Define the number of topics or components
num_components = 5
# Create LDA object
model = LatentDirichletAllocation(n_components=num_components)
# Fit and Transform
lda_matrix = model.fit_transform(train_data)
# Get Components
lda_components = model.components_
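components_ has shape (n_components, n_terms), with each row holding pseudo-counts for one topic. As an optional, illustrative step (the variable name topic_word_distributions is mine, not from the original), you can normalize each row to get a proper word distribution per topic:
# Dividing each row by its sum makes the row sum to 1, i.e. a distribution over terms
topic_word_distributions = lda_components / lda_components.sum(axis=1, keepdims=True)
print(topic_word_distributions.shape)   # (num_components, number of terms)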
# Print the topics with their terms
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the replacement
terms = tfidf.get_feature_names_out()
for index, component in enumerate(lda_components):
    # Pair each term with its weight in this topic and keep the 7 highest-weighted terms
    zipped = zip(terms, component)
    top_terms_key = sorted(zipped, key=lambda t: t[1], reverse=True)[:7]
    top_terms_list = list(dict(top_terms_key).keys())
    print("Topic " + str(index) + ": ", top_terms_list)
Topic 0: ['cup', 'australia', 'world', 'win', 'final', 'day', 'new']
Topic 1: ['police', 'man', 'court', 'charged', 'murder', 'accused', 'death']
Topic 2: ['crash', 'missing', 'killed', 'man', 'car', 'search', 'dead']
Topic 3: ['new', 'council', 'rural', 'plan', 'govt', 'water', 'national']
Topic 4: ['interview', 'election', 'says', 'trump', 'climate', 'labor', 'change']
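To see which headlines a topic dominates, one optional follow-up (a sketch assuming the documents_list and lda_matrix defined above; topic_id and top_docs are illustrative names) is to sort documents by their weight for that topic:
import numpy as np
# lda_matrix has shape (n_documents, num_components); column t holds each document's
# weight for topic t, so argsort in descending order gives the most topic-heavy headlines
topic_id = 1
top_docs = np.argsort(lda_matrix[:, topic_id])[::-1][:5]
for doc_index in top_docs:
    print(documents_list[doc_index])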
# Vectorize an unseen headline and infer its topic mixture
x = tfidf.transform(["Pence: 'I think we'll have better choices in the future' than Trump"])
model.transform(x)
array([[0.05615564, 0.05644629, 0.05601956, 0.29934203, 0.53203649]])
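Each entry is the headline's weight for one of the five topics; here the largest weight (0.53) falls on topic 4, the election/Trump topic, which matches the headline. A small hedged sketch (doc_topics is an illustrative name) to pull out the most likely topic index:
import numpy as np
doc_topics = model.transform(x)
# The column with the largest weight is the most likely topic for the headline
print(np.argmax(doc_topics, axis=1))   # e.g. [4] for the output above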