In [48]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
In [49]:
df = pd.read_csv('data/user-ct-test-collection-01.txt', sep="\t")
df.head()
Out[49]:
AnonID Query QueryTime ItemRank ClickURL
0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN
1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN
2 142 staple.com 2006-03-17 21:19:29 NaN NaN
3 142 staple.com 2006-03-17 21:19:45 NaN NaN
4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN
In [50]:
# the stopword list requires a one-time download: nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# treat 'www' and common domain suffixes as stopwords, since they carry no meaning in query text
stop_words.update(['www', 'com', 'org', 'net', 'int', 'edu', 'gov', 'mil'])
st = SnowballStemmer('english')
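As a quick sanity check (an illustrative cell, not from the original run), the stemmer can be tried on tokens from the queries above:
In [ ]:
st.stem('staple'), st.stem('prescriptionfortime')
# -> ('stapl', 'prescriptionfortim'), matching the clean_Query values shown below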
In [51]:
col = 'Query'
clean_col = 'clean_Query'
In [52]:
# lowercase and strip leading/trailing whitespace (str() guards against non-string values such as NaN)
df[clean_col] = df[col].apply(lambda x: str(x).lower().strip())
In [53]:
# replace anything that is not a letter (punctuation, digits) with a space
df[clean_col] = df[clean_col].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
In [54]:
# collapse the runs of spaces introduced by the previous step
df[clean_col] = df[clean_col].apply(lambda x: re.sub(' +', ' ', x).strip())
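For example (an illustrative cell, not from the original run), the two substitutions turn a domain-style query into space-separated tokens:
In [ ]:
re.sub(' +', ' ', re.sub('[^a-zA-Z]', ' ', 'www.prescriptionfortime.com'))
# -> 'www prescriptionfortime com'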
In [55]:
# drop stopwords, then reduce each remaining token to its stem
df[clean_col] = df[clean_col].apply(lambda x: ' '.join(st.stem(token) for token in x.split() if token not in stop_words))
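Putting the steps together (an illustrative helper, not part of the original notebook), a single query can be traced through the whole pipeline:
In [ ]:
def clean_query(q):
    """Apply the same cleaning steps as above to one query string (illustrative helper)."""
    q = str(q).lower().strip()          # lowercase, trim
    q = re.sub('[^a-zA-Z]', ' ', q)     # non-letters -> spaces
    q = re.sub(' +', ' ', q).strip()    # collapse space runs
    return ' '.join(st.stem(t) for t in q.split() if t not in stop_words)

clean_query('www.prescriptionfortime.com')
# -> 'prescriptionfortim', matching the clean_Query column below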
In [56]:
# write the cleaned data; index=False avoids an extra unnamed index column on reload
df.to_csv('data/cleanedData.csv', sep=',', encoding='utf-8', index=False)
In [57]:
df.head()
Out[57]:
AnonID Query QueryTime ItemRank ClickURL clean_Query
0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect
1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim
2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl
3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl
4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit
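To reuse the cleaned data later (an illustrative cell, assuming the file written above), it can be read straight back:
In [ ]:
cleaned = pd.read_csv('data/cleanedData.csv')
cleaned[['Query', 'clean_Query']].head()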