import pandas as pd
import numpy as np


df = pd.read_csv('data/cleanedData.csv', sep=",")
df.head()


def add_timestamp(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.timestamp()
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
df.sort_values("TIMESTAMP")


#df = df[:20000]
train_split = 0.5
size = df.shape[0]
train_size = int(size * train_split)
training_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]
df = training_df


import datetime
day_time_second_start = 7 * 3600
day_time_second_end = 19 * 3600
def is_day_time(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    seconds_of_day = datetime_object.hour * 3600 + datetime_object.minute * 60 + datetime_object.second
    if day_time_second_start<= seconds_of_day and seconds_of_day <= day_time_second_end:
        return True
    else:
        return False
df["IS_DAY_TIME"] = df['QueryTime'].apply(is_day_time)


def query_hour(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.hour
df["QUERY_HOUR"] = df['QueryTime'].apply(query_hour)
df.head()


def assign_query_time(l):
    length = len(l)
    true_count = sum(l)
    if true_count/length >= 0.8:
        # is day_time
        return 1
    elif true_count/length <= 0.2:
        # is night_time
        return 0
    else:
        # is any_time
        return 2
query_time_compatibility_expected = df.groupby('clean_Query').agg(assign_query_time)
query_time_compatibility_expected_dict = query_time_compatibility_expected.to_dict()


df = df[df['clean_Query'].notna()]
def populate_assigned_query_time(query):
    return query_time_compatibility_expected_dict["IS_DAY_TIME"][query]
df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)
df.head()

<ipython-input-96-c9c24ded9df4>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)


def is_compatible(row):
    if(row["EXPECTED_DAY_TIME"]==1 and not row["IS_DAY_TIME"]):
        return False
    if(row["EXPECTED_DAY_TIME"]==0 and row["IS_DAY_TIME"]):
        return False
    return True
df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)

<ipython-input-97-247ccc1e5eec>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)


query_freq = df['clean_Query'].value_counts().to_dict()
df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda x : query_freq[x])
df.head()

<ipython-input-98-d482504428c0>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda x : query_freq[x])


def add_timestamp(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.timestamp()
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
df.head()

<ipython-input-99-d86c64f1d509>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)


day_time_second_start = 7 * 3600
day_time_second_end = 19 * 3600
def agg_all_query_hour(l):
    day = 0
    night = 0
    for datetime_str in l:
        datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        seconds_of_day = datetime_object.hour * 3600 + datetime_object.minute * 60 + datetime_object.second
        if day_time_second_start<= seconds_of_day and seconds_of_day <= day_time_second_end:
            day += 1
        else:
            night += 1
    NT_FREQ_DAY = day/12
    NT_FREQ_NIGHT = night/12
    NT_FREQ24 = (day+night)/24
    return (NT_FREQ24, NT_FREQ_DAY, NT_FREQ_NIGHT);
query_all_timestamps = df.groupby('clean_Query')[["QueryTime"]].agg(agg_all_query_hour)
query_all_timestamps_dict = query_all_timestamps.to_dict()


df_q = pd.DataFrame.from_dict(query_all_timestamps_dict.get('QueryTime'),orient='index')


df_q_sorted0 = df_q.sort_values(0, ascending=False).reset_index(level=0)
df_q_sorted1 = df_q.sort_values(1, ascending=False).reset_index(level=0)
df_q_sorted2 = df_q.sort_values(2, ascending=False).reset_index(level=0)


l0 = list(df_q_sorted0['index'])
l1  = list(df_q_sorted1['index'])
l2 = list(df_q_sorted2['index'])


s_size = 50
ss_size = 30


s_list = list(df_q_sorted0[:s_size]['index'])
print(s_list)

['googl', 'yahoo', 'ebay', 'myspac', 'mapquest', 'space', 'hotmail', 'pogo', 'msn', 'craigslist', 'http', 'bankofamerica', 'map quest', 'weather', 'internet', 'dictionari', 'yahoo mail', 'american idol', 'horoscop', 'bank america', 'walmart', 'ask', 'ask jeev', 'n e evergreen solar', 'target', 'amazon', 'ir', 'tattoo', 'map', 'sexi swimwear', 'southwest airlin', 'lyric', 'qvc', 'low', 'porn', 'orbitz', 'fidel', 'yahoo person', 'verizon', 'home depot', 'sex', 'expedia', 'cnn', 'disney channel', 'sear', 'sin midnight kimber logan', 'donni mcclurkin', 'wachovia', 'eharmoni', 'traveloc']


ss_day_list = []
count = 0
for q in l1:
    if q in s_list:
        continue
    ss_day_list.append(q)
    if len(ss_day_list)==ss_size:
        break
print(ss_day_list)

['yellow page', 'clive owen', 'amex bema gold', 'sale rep china product', 'kimber logan', 'nyse wit', 'craig list', 'amex bgo', 'monster', 'ticketmast', 'nick', 'blackplanet', 'nyse hl', 'white page', 'usp', 'cartoonetwork', 'donbest', 'nyse eslr', 'nascar', 'hsn', 'cingular', 'runescap', 'adserv ign', 'up', 'best buy', 'n e wipro', 'dogpil', 'realtor', 'match', 'mycl cravelyr']


ss_night_list = []
count = 0
for q in l2:
    if q in s_list:
        continue
    ss_night_list.append(q)
    if len(ss_night_list)==ss_size:
        break
print(ss_night_list)

['miami heat', 'myspac layout', 'mycl cravelyr', 'daniel pipe', 'swimwear', 'adserv ign', 'gold', 'donni mcclurkin cd', 'mailbox', 'nbc', 'realtor', 'yahoo game', 'hilari duff', 'american airlin', 'gl ch section relat wetland protect act', 'limewir', 'uproar', 'deal deal', 'pink world', 'donbest', 'cingular', 'raven riley', 'profileedit myspac', 'goggl', 'sheer swimwear', 'runescap', 'espn', 'match', 'slingo', 'n']


import pickle
with open('data/all_list.pkl', 'wb') as f:
    pickle.dump(l0, f)
with open('data/day_list.pkl', 'wb') as f:
    pickle.dump(l1, f)
with open('data/night_list.pkl', 'wb') as f:
    pickle.dump(l2, f)


df.to_csv('data/ssd_train_large.csv', sep=',', encoding='utf-8')
test_df.to_csv('data/ssd_test_large.csv', sep=',', encoding='utf-8')

	Unnamed: 0	AnonID	Query	QueryTime	ItemRank	ClickURL	clean_Query
0	0	142	rentdirect.com	2006-03-01 07:17:12	NaN	NaN	rentdirect
1	1	142	www.prescriptionfortime.com	2006-03-12 12:31:06	NaN	NaN	prescriptionfortim
2	2	142	staple.com	2006-03-17 21:19:29	NaN	NaN	stapl
3	3	142	staple.com	2006-03-17 21:19:45	NaN	NaN	stapl
4	4	142	www.newyorklawyersite.com	2006-03-18 08:02:58	NaN	NaN	newyorklawyersit

	Unnamed: 0	AnonID	Query	QueryTime	ItemRank	ClickURL	clean_Query	TIMESTAMP
1359555	1359555	3389643	www.uabhealthsystems.com	2006-03-01 00:01:53	NaN	NaN	uabhealthsystem	1.141200e+09
592221	592221	1499659	ohio medical card	2006-03-01 00:02:02	1.0	http://jfs.ohio.gov	ohio medic card	1.141200e+09
477281	477281	1196769	zip codes	2006-03-01 00:02:10	NaN	NaN	zip code	1.141200e+09
1359556	1359556	3389643	uabhealthsystems.com	2006-03-01 00:02:22	NaN	NaN	uabhealthsystem	1.141200e+09
896721	896721	2272416	theunorthodoxjew.blogspot.com	2006-03-01 00:03:04	1.0	http://theunorthodoxjew.blogspot.com	theunorthodoxjew blogspot	1.141200e+09
...	...	...	...	...	...	...	...	...
435971	435971	1059309	-	2006-05-31 23:59:52	NaN	NaN	NaN	1.149145e+09
530715	530715	1341422	-	2006-05-31 23:59:53	NaN	NaN	NaN	1.149145e+09
3519390	3519390	23467494	fk.com	2006-05-31 23:59:57	NaN	NaN	fk	1.149145e+09
2829973	2829973	11572479	longmont daily times call	2006-05-31 23:59:58	NaN	NaN	longmont daili time call	1.149145e+09
433600	433600	1048985	seattle bus schedule	2006-05-31 23:59:58	1.0	http://transit.metrokc.gov	seattl bus schedul	1.149145e+09

	Unnamed: 0	AnonID	Query	QueryTime	ItemRank	ClickURL	clean_Query	TIMESTAMP	IS_DAY_TIME	QUERY_HOUR
0	0	142	rentdirect.com	2006-03-01 07:17:12	NaN	NaN	rentdirect	1.141226e+09	True	7
1	1	142	www.prescriptionfortime.com	2006-03-12 12:31:06	NaN	NaN	prescriptionfortim	1.142192e+09	True	12
2	2	142	staple.com	2006-03-17 21:19:29	NaN	NaN	stapl	1.142656e+09	False	21
3	3	142	staple.com	2006-03-17 21:19:45	NaN	NaN	stapl	1.142656e+09	False	21
4	4	142	www.newyorklawyersite.com	2006-03-18 08:02:58	NaN	NaN	newyorklawyersit	1.142694e+09	True	8

	Unnamed: 0	AnonID	Query	QueryTime	ItemRank	ClickURL	clean_Query	TIMESTAMP	IS_DAY_TIME	QUERY_HOUR	EXPECTED_DAY_TIME
0	0	142	rentdirect.com	2006-03-01 07:17:12	NaN	NaN	rentdirect	1.141226e+09	True	7	1
1	1	142	www.prescriptionfortime.com	2006-03-12 12:31:06	NaN	NaN	prescriptionfortim	1.142192e+09	True	12	1
2	2	142	staple.com	2006-03-17 21:19:29	NaN	NaN	stapl	1.142656e+09	False	21	2
3	3	142	staple.com	2006-03-17 21:19:45	NaN	NaN	stapl	1.142656e+09	False	21	2
4	4	142	www.newyorklawyersite.com	2006-03-18 08:02:58	NaN	NaN	newyorklawyersit	1.142694e+09	True	8	1

	Unnamed: 0	AnonID	Query	QueryTime	ItemRank	ClickURL	clean_Query	TIMESTAMP	IS_DAY_TIME	QUERY_HOUR	EXPECTED_DAY_TIME	IS_COMPAT	TOTAL_FREQ
0	0	142	rentdirect.com	2006-03-01 07:17:12	NaN	NaN	rentdirect	1.141226e+09	True	7	1	True	1
1	1	142	www.prescriptionfortime.com	2006-03-12 12:31:06	NaN	NaN	prescriptionfortim	1.142192e+09	True	12	1	True	1
2	2	142	staple.com	2006-03-17 21:19:29	NaN	NaN	stapl	1.142656e+09	False	21	2	True	228
3	3	142	staple.com	2006-03-17 21:19:45	NaN	NaN	stapl	1.142656e+09	False	21	2	True	228
4	4	142	www.newyorklawyersite.com	2006-03-18 08:02:58	NaN	NaN	newyorklawyersit	1.142694e+09	True	8	1	True	2