import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Load the AOL query-log collection (tab-separated columns:
# AnonID, Query, QueryTime, ItemRank, ClickURL).
df = pd.read_csv('data/user-ct-test-collection-01.txt', sep="\t")
# BUG FIX: was `data.head()` — the frame is bound to `df`, so referencing
# `data` raised a NameError.
df.head()
AnonID | Query | QueryTime | ItemRank | ClickURL | |
---|---|---|---|---|---|
0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN |
1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN |
2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN |
3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN |
4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN |
# Tokens to discard: standard English stopwords plus URL noise — the
# "www" prefix and common top-level domains.
stop_words = set(stopwords.words('english'))
stop_words.update(['com', 'org', 'net', 'int', 'edu', 'gov', 'mil', 'www'])

st = SnowballStemmer('english')

col = 'Query'
clean_col = 'clean_Query'

# Compiled once and hoisted out of the per-row apply — the original
# recompiled this pattern for every row.
_non_alpha = re.compile('[^a-zA-Z]')

def _clean_query(raw):
    """Lowercase a query, strip non-letters, drop stopwords, stem each token.

    Returns the cleaned query as a single space-joined string.
    """
    # Replacing every non-letter with a space removes punctuation/digits,
    # and str.split() then normalizes any run of whitespace — so the
    # original's separate strip()/collapse-spaces passes (which ran
    # *before* punctuation removal, where collapsing had no effect)
    # are unnecessary. Final output is identical.
    text = _non_alpha.sub(' ', str(raw).lower())
    return ' '.join(st.stem(tok) for tok in text.split() if tok not in stop_words)

# One pass over the column instead of four chained .apply() calls.
df[clean_col] = df[col].apply(_clean_query)

# NOTE(review): the row index is written too (pandas default index=True) —
# kept for backward compatibility with any downstream reader expecting it.
df.to_csv('data/cleanedData.csv', sep=',', encoding='utf-8')
df.head()
AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | |
---|---|---|---|---|---|---|
0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect |
1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim |
2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl |
3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl |
4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit |