In [48]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
In [49]:
df = pd.read_csv('data/user-ct-test-collection-01.txt', sep="\t")
df.head()
Out[49]:
AnonID Query QueryTime ItemRank ClickURL
0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN
1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN
2 142 staple.com 2006-03-17 21:19:29 NaN NaN
3 142 staple.com 2006-03-17 21:19:45 NaN NaN
4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN
In [50]:
# the stopword list requires a one-time download: nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# treat 'www' and common domain suffixes as stopwords, since they carry no meaning in query text
stop_words.update(['www', 'com', 'org', 'net', 'int', 'edu', 'gov', 'mil'])
st = SnowballStemmer('english')
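As a quick sanity check (an illustrative cell, not from the original run), the stemmer can be tried on tokens from the queries above:
In [ ]:
st.stem('staple'), st.stem('prescriptionfortime')
# -> ('stapl', 'prescriptionfortim'), matching the clean_Query values shown below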
In [51]:
col = 'Query'
clean_col = 'clean_Query'
In [52]:
# lowercase and strip leading/trailing whitespace (str() guards against non-string values such as NaN)
df[clean_col] = df[col].apply(lambda x: str(x).lower().strip())
In [53]:
# replace anything that is not a letter (punctuation, digits) with a space
df[clean_col] = df[clean_col].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
In [54]:
# collapse the runs of spaces introduced by the previous step
df[clean_col] = df[clean_col].apply(lambda x: re.sub(' +', ' ', x).strip())
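For example (an illustrative cell, not from the original run), the two substitutions turn a domain-style query into space-separated tokens:
In [ ]:
re.sub(' +', ' ', re.sub('[^a-zA-Z]', ' ', 'www.prescriptionfortime.com'))
# -> 'www prescriptionfortime com'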
In [55]:
# drop stopwords, then reduce each remaining token to its stem
df[clean_col] = df[clean_col].apply(lambda x: ' '.join(st.stem(token) for token in x.split() if token not in stop_words))
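Putting the steps together (an illustrative helper, not part of the original notebook), a single query can be traced through the whole pipeline:
In [ ]:
def clean_query(q):
    """Apply the same cleaning steps as above to one query string (illustrative helper)."""
    q = str(q).lower().strip()          # lowercase, trim
    q = re.sub('[^a-zA-Z]', ' ', q)     # non-letters -> spaces
    q = re.sub(' +', ' ', q).strip()    # collapse space runs
    return ' '.join(st.stem(t) for t in q.split() if t not in stop_words)

clean_query('www.prescriptionfortime.com')
# -> 'prescriptionfortim', matching the clean_Query column below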
In [56]:
# write the cleaned data; index=False avoids an extra unnamed index column on reload
df.to_csv('data/cleanedData.csv', sep=',', encoding='utf-8', index=False)
In [57]:
df.head()
Out[57]:
AnonID Query QueryTime ItemRank ClickURL clean_Query
0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect
1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim
2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl
3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl
4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit
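To reuse the cleaned data later (an illustrative cell, assuming the file written above), it can be read straight back:
In [ ]:
cleaned = pd.read_csv('data/cleanedData.csv')
cleaned[['Query', 'clean_Query']].head()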