import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Load Dataset
# error_bad_lines was removed in pandas 2.0; on_bad_lines='skip' drops malformed rows instead
data = pd.read_csv('data/news-date-text.csv', on_bad_lines='skip')
documents_list = data['headline_text'].tolist()
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')
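As a quick illustration (not part of the original walkthrough), the regex tokenizer simply keeps runs of word characters and drops punctuation:
# Example: apostrophes split words, so possessives leave a stray "s" token
print(tokenizer.tokenize("Australia's world cup win!"))   # ['Australia', 's', 'world', 'cup', 'win']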
# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize)
# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)
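As a sanity check (an optional step, not in the original code), you can inspect the shape of the document-term matrix and the size of the learned vocabulary:
# train_data is a sparse matrix of shape (n_documents, n_terms)
print(train_data.shape)
print(len(tfidf.vocabulary_))   # number of unique terms kept by the vectorizer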
UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None
# Define the number of topics or components
num_components = 5
# Create LDA object
model = LatentDirichletAllocation(n_components=num_components)
# Fit and Transform
lda_matrix = model.fit_transform(train_data)
# Get Components
lda_components = model.components_
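components_ has shape (n_components, n_terms), with each row holding pseudo-counts for one topic. As an optional, illustrative step (the variable name topic_word_distributions is mine, not from the original), you can normalize each row to get a proper word distribution per topic:
# Dividing each row by its sum makes the row sum to 1, i.e. a distribution over terms
topic_word_distributions = lda_components / lda_components.sum(axis=1, keepdims=True)
print(topic_word_distributions.shape)   # (num_components, number of terms)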
# Print the topics with their terms
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the replacement
terms = tfidf.get_feature_names_out()
for index, component in enumerate(lda_components):
    # Pair each term with its weight in this topic and keep the 7 highest-weighted terms
    zipped = zip(terms, component)
    top_terms_key = sorted(zipped, key=lambda t: t[1], reverse=True)[:7]
    top_terms_list = list(dict(top_terms_key).keys())
    print("Topic " + str(index) + ": ", top_terms_list)
Topic 0: ['cup', 'australia', 'world', 'win', 'final', 'day', 'new']
Topic 1: ['police', 'man', 'court', 'charged', 'murder', 'accused', 'death']
Topic 2: ['crash', 'missing', 'killed', 'man', 'car', 'search', 'dead']
Topic 3: ['new', 'council', 'rural', 'plan', 'govt', 'water', 'national']
Topic 4: ['interview', 'election', 'says', 'trump', 'climate', 'labor', 'change']
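To see which headlines a topic dominates, one optional follow-up (a sketch assuming the documents_list and lda_matrix defined above; topic_id and top_docs are illustrative names) is to sort documents by their weight for that topic:
import numpy as np
# lda_matrix has shape (n_documents, num_components); column t holds each document's
# weight for topic t, so argsort in descending order gives the most topic-heavy headlines
topic_id = 1
top_docs = np.argsort(lda_matrix[:, topic_id])[::-1][:5]
for doc_index in top_docs:
    print(documents_list[doc_index])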
# Vectorize an unseen headline and infer its topic mixture
x = tfidf.transform(["Pence: 'I think we'll have better choices in the future' than Trump"])
model.transform(x)
array([[0.05615564, 0.05644629, 0.05601956, 0.29934203, 0.53203649]])
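Each entry is the headline's weight for one of the five topics; here the largest weight (0.53) falls on topic 4, the election/Trump topic, which matches the headline. A small hedged sketch (doc_topics is an illustrative name) to pull out the most likely topic index:
import numpy as np
doc_topics = model.transform(x)
# The column with the largest weight is the most likely topic for the headline
print(np.argmax(doc_topics, axis=1))   # e.g. [4] for the output above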