In [88]:
import pandas as pd
import numpy as np
In [89]:
df = pd.read_csv('data/cleanedData.csv', sep=",")
df.head()
Out[89]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query
0 0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect
1 1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim
2 2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl
3 3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl
4 4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit
In [90]:
def add_timestamp(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.timestamp()
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
df.sort_values("TIMESTAMP")
Out[90]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query TIMESTAMP
1359555 1359555 3389643 www.uabhealthsystems.com 2006-03-01 00:01:53 NaN NaN uabhealthsystem 1.141200e+09
592221 592221 1499659 ohio medical card 2006-03-01 00:02:02 1.0 http://jfs.ohio.gov ohio medic card 1.141200e+09
477281 477281 1196769 zip codes 2006-03-01 00:02:10 NaN NaN zip code 1.141200e+09
1359556 1359556 3389643 uabhealthsystems.com 2006-03-01 00:02:22 NaN NaN uabhealthsystem 1.141200e+09
896721 896721 2272416 theunorthodoxjew.blogspot.com 2006-03-01 00:03:04 1.0 http://theunorthodoxjew.blogspot.com theunorthodoxjew blogspot 1.141200e+09
... ... ... ... ... ... ... ... ...
435971 435971 1059309 - 2006-05-31 23:59:52 NaN NaN NaN 1.149145e+09
530715 530715 1341422 - 2006-05-31 23:59:53 NaN NaN NaN 1.149145e+09
3519390 3519390 23467494 fk.com 2006-05-31 23:59:57 NaN NaN fk 1.149145e+09
2829973 2829973 11572479 longmont daily times call 2006-05-31 23:59:58 NaN NaN longmont daili time call 1.149145e+09
433600 433600 1048985 seattle bus schedule 2006-05-31 23:59:58 1.0 http://transit.metrokc.gov seattl bus schedul 1.149145e+09

3558411 rows × 8 columns

In [92]:
#df = df[:20000]
train_split = 0.5
size = df.shape[0]
train_size = int(size * train_split)
training_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]
df = training_df
In [93]:
import datetime
day_time_second_start = 7 * 3600
day_time_second_end = 19 * 3600
def is_day_time(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    seconds_of_day = datetime_object.hour * 3600 + datetime_object.minute * 60 + datetime_object.second
    if day_time_second_start<= seconds_of_day and seconds_of_day <= day_time_second_end:
        return True
    else:
        return False
df["IS_DAY_TIME"] = df['QueryTime'].apply(is_day_time)
In [94]:
def query_hour(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.hour
df["QUERY_HOUR"] = df['QueryTime'].apply(query_hour)
df.head()
Out[94]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query TIMESTAMP IS_DAY_TIME QUERY_HOUR
0 0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect 1.141226e+09 True 7
1 1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim 1.142192e+09 True 12
2 2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl 1.142656e+09 False 21
3 3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl 1.142656e+09 False 21
4 4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit 1.142694e+09 True 8
In [95]:
def assign_query_time(l):
    length = len(l)
    true_count = sum(l)
    if true_count/length >= 0.8:
        # is day_time
        return 1
    elif true_count/length <= 0.2:
        # is night_time
        return 0
    else:
        # is any_time
        return 2
query_time_compatibility_expected = df.groupby('clean_Query').agg(assign_query_time)
query_time_compatibility_expected_dict = query_time_compatibility_expected.to_dict()
In [96]:
df = df[df['clean_Query'].notna()]
def populate_assigned_query_time(query):
    return query_time_compatibility_expected_dict["IS_DAY_TIME"][query]
df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)
df.head()
<ipython-input-96-c9c24ded9df4>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)
Out[96]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query TIMESTAMP IS_DAY_TIME QUERY_HOUR EXPECTED_DAY_TIME
0 0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect 1.141226e+09 True 7 1
1 1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim 1.142192e+09 True 12 1
2 2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl 1.142656e+09 False 21 2
3 3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl 1.142656e+09 False 21 2
4 4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit 1.142694e+09 True 8 1
In [97]:
def is_compatible(row):
    if(row["EXPECTED_DAY_TIME"]==1 and not row["IS_DAY_TIME"]):
        return False
    if(row["EXPECTED_DAY_TIME"]==0 and row["IS_DAY_TIME"]):
        return False
    return True
df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)
<ipython-input-97-247ccc1e5eec>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)
In [98]:
query_freq = df['clean_Query'].value_counts().to_dict()
df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda x : query_freq[x])
df.head()
<ipython-input-98-d482504428c0>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda x : query_freq[x])
Out[98]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query TIMESTAMP IS_DAY_TIME QUERY_HOUR EXPECTED_DAY_TIME IS_COMPAT TOTAL_FREQ
0 0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect 1.141226e+09 True 7 1 True 1
1 1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim 1.142192e+09 True 12 1 True 1
2 2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl 1.142656e+09 False 21 2 True 228
3 3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl 1.142656e+09 False 21 2 True 228
4 4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit 1.142694e+09 True 8 1 True 2
In [99]:
def add_timestamp(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.timestamp()
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
df.head()
<ipython-input-99-d86c64f1d509>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
Out[99]:
Unnamed: 0 AnonID Query QueryTime ItemRank ClickURL clean_Query TIMESTAMP IS_DAY_TIME QUERY_HOUR EXPECTED_DAY_TIME IS_COMPAT TOTAL_FREQ
0 0 142 rentdirect.com 2006-03-01 07:17:12 NaN NaN rentdirect 1.141226e+09 True 7 1 True 1
1 1 142 www.prescriptionfortime.com 2006-03-12 12:31:06 NaN NaN prescriptionfortim 1.142192e+09 True 12 1 True 1
2 2 142 staple.com 2006-03-17 21:19:29 NaN NaN stapl 1.142656e+09 False 21 2 True 228
3 3 142 staple.com 2006-03-17 21:19:45 NaN NaN stapl 1.142656e+09 False 21 2 True 228
4 4 142 www.newyorklawyersite.com 2006-03-18 08:02:58 NaN NaN newyorklawyersit 1.142694e+09 True 8 1 True 2
In [100]:
day_time_second_start = 7 * 3600
day_time_second_end = 19 * 3600
def agg_all_query_hour(l):
    day = 0
    night = 0
    for datetime_str in l:
        datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        seconds_of_day = datetime_object.hour * 3600 + datetime_object.minute * 60 + datetime_object.second
        if day_time_second_start<= seconds_of_day and seconds_of_day <= day_time_second_end:
            day += 1
        else:
            night += 1
    NT_FREQ_DAY = day/12
    NT_FREQ_NIGHT = night/12
    NT_FREQ24 = (day+night)/24
    return (NT_FREQ24, NT_FREQ_DAY, NT_FREQ_NIGHT);
query_all_timestamps = df.groupby('clean_Query')[["QueryTime"]].agg(agg_all_query_hour)
query_all_timestamps_dict = query_all_timestamps.to_dict()
In [101]:
df_q = pd.DataFrame.from_dict(query_all_timestamps_dict.get('QueryTime'),orient='index')
In [103]:
df_q_sorted0 = df_q.sort_values(0, ascending=False).reset_index(level=0)
df_q_sorted1 = df_q.sort_values(1, ascending=False).reset_index(level=0)
df_q_sorted2 = df_q.sort_values(2, ascending=False).reset_index(level=0)
In [104]:
l0 = list(df_q_sorted0['index'])
l1  = list(df_q_sorted1['index'])
l2 = list(df_q_sorted2['index'])
In [105]:
s_size = 50
ss_size = 30
In [106]:
s_list = list(df_q_sorted0[:s_size]['index'])
print(s_list)
['googl', 'yahoo', 'ebay', 'myspac', 'mapquest', 'space', 'hotmail', 'pogo', 'msn', 'craigslist', 'http', 'bankofamerica', 'map quest', 'weather', 'internet', 'dictionari', 'yahoo mail', 'american idol', 'horoscop', 'bank america', 'walmart', 'ask', 'ask jeev', 'n e evergreen solar', 'target', 'amazon', 'ir', 'tattoo', 'map', 'sexi swimwear', 'southwest airlin', 'lyric', 'qvc', 'low', 'porn', 'orbitz', 'fidel', 'yahoo person', 'verizon', 'home depot', 'sex', 'expedia', 'cnn', 'disney channel', 'sear', 'sin midnight kimber logan', 'donni mcclurkin', 'wachovia', 'eharmoni', 'traveloc']
In [107]:
ss_day_list = []
count = 0
for q in l1:
    if q in s_list:
        continue
    ss_day_list.append(q)
    if len(ss_day_list)==ss_size:
        break
print(ss_day_list)
['yellow page', 'clive owen', 'amex bema gold', 'sale rep china product', 'kimber logan', 'nyse wit', 'craig list', 'amex bgo', 'monster', 'ticketmast', 'nick', 'blackplanet', 'nyse hl', 'white page', 'usp', 'cartoonetwork', 'donbest', 'nyse eslr', 'nascar', 'hsn', 'cingular', 'runescap', 'adserv ign', 'up', 'best buy', 'n e wipro', 'dogpil', 'realtor', 'match', 'mycl cravelyr']
In [108]:
ss_night_list = []
count = 0
for q in l2:
    if q in s_list:
        continue
    ss_night_list.append(q)
    if len(ss_night_list)==ss_size:
        break
print(ss_night_list)
['miami heat', 'myspac layout', 'mycl cravelyr', 'daniel pipe', 'swimwear', 'adserv ign', 'gold', 'donni mcclurkin cd', 'mailbox', 'nbc', 'realtor', 'yahoo game', 'hilari duff', 'american airlin', 'gl ch section relat wetland protect act', 'limewir', 'uproar', 'deal deal', 'pink world', 'donbest', 'cingular', 'raven riley', 'profileedit myspac', 'goggl', 'sheer swimwear', 'runescap', 'espn', 'match', 'slingo', 'n']
In [109]:
import pickle
with open('data/all_list.pkl', 'wb') as f:
    pickle.dump(l0, f)
with open('data/day_list.pkl', 'wb') as f:
    pickle.dump(l1, f)
with open('data/night_list.pkl', 'wb') as f:
    pickle.dump(l2, f)
In [110]:
df.to_csv('data/ssd_train_large.csv', sep=',', encoding='utf-8')
test_df.to_csv('data/ssd_test_large.csv', sep=',', encoding='utf-8')