import datetime

import numpy as np
import pandas as pd
# Load the pre-cleaned query log (one row per query event; see preview below).
df = pd.read_csv('data/cleanedData.csv', sep=",")
df.head()
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | |
---|---|---|---|---|---|---|---|
0 | 0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect |
1 | 1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim |
2 | 2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl |
3 | 3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl |
4 | 4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit |
def add_timestamp(datetime_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and return its POSIX timestamp.

    NOTE: the input is naive, so .timestamp() interprets it in the local
    timezone (standard Python behavior for naive datetimes).
    """
    parsed = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return parsed.timestamp()
# Numeric timestamp feature for every query event.
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
# NOTE(review): the sorted result is not assigned back — outside a notebook
# this line is display-only and df itself keeps its original row order.
df.sort_values("TIMESTAMP")
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | TIMESTAMP | |
---|---|---|---|---|---|---|---|---|
1359555 | 1359555 | 3389643 | www.uabhealthsystems.com | 2006-03-01 00:01:53 | NaN | NaN | uabhealthsystem | 1.141200e+09 |
592221 | 592221 | 1499659 | ohio medical card | 2006-03-01 00:02:02 | 1.0 | http://jfs.ohio.gov | ohio medic card | 1.141200e+09 |
477281 | 477281 | 1196769 | zip codes | 2006-03-01 00:02:10 | NaN | NaN | zip code | 1.141200e+09 |
1359556 | 1359556 | 3389643 | uabhealthsystems.com | 2006-03-01 00:02:22 | NaN | NaN | uabhealthsystem | 1.141200e+09 |
896721 | 896721 | 2272416 | theunorthodoxjew.blogspot.com | 2006-03-01 00:03:04 | 1.0 | http://theunorthodoxjew.blogspot.com | theunorthodoxjew blogspot | 1.141200e+09 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
435971 | 435971 | 1059309 | - | 2006-05-31 23:59:52 | NaN | NaN | NaN | 1.149145e+09 |
530715 | 530715 | 1341422 | - | 2006-05-31 23:59:53 | NaN | NaN | NaN | 1.149145e+09 |
3519390 | 3519390 | 23467494 | fk.com | 2006-05-31 23:59:57 | NaN | NaN | fk | 1.149145e+09 |
2829973 | 2829973 | 11572479 | longmont daily times call | 2006-05-31 23:59:58 | NaN | NaN | longmont daili time call | 1.149145e+09 |
433600 | 433600 | 1048985 | seattle bus schedule | 2006-05-31 23:59:58 | 1.0 | http://transit.metrokc.gov | seattl bus schedul | 1.149145e+09 |
3558411 rows × 8 columns
#df = df[:20000]  # (debug knob) uncomment to work on a 20k-row sample
# 50/50 chronological-order split of the log into train and test halves.
train_split = 0.5
size = df.shape[0]
train_size = int(size * train_split)
# .copy() so the feature columns assigned later write to an independent frame
# rather than a view — the original run emitted SettingWithCopyWarning here.
training_df = df.iloc[:train_size].copy()
test_df = df.iloc[train_size:]
df = training_df
import datetime

day_time_second_start = 7 * 3600    # 07:00:00 expressed as seconds-of-day
day_time_second_end = 19 * 3600     # 19:00:00 expressed as seconds-of-day (inclusive)

def is_day_time(datetime_str):
    """Return True if 'YYYY-MM-DD HH:MM:SS' falls inside 07:00-19:00 inclusive."""
    dt = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    seconds_of_day = dt.hour * 3600 + dt.minute * 60 + dt.second
    # Chained comparison replaces the old `if ...: return True / else: return False`.
    return day_time_second_start <= seconds_of_day <= day_time_second_end
# Flag each query event as daytime (07:00-19:00) or not.
df["IS_DAY_TIME"] = df['QueryTime'].apply(is_day_time)
def query_hour(datetime_str):
    """Return the hour-of-day (0-23) of a 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S').hour
# Hour-of-day feature for each query event.
df["QUERY_HOUR"] = df['QueryTime'].apply(query_hour)
df.head()
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | TIMESTAMP | IS_DAY_TIME | QUERY_HOUR | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect | 1.141226e+09 | True | 7 |
1 | 1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim | 1.142192e+09 | True | 12 |
2 | 2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl | 1.142656e+09 | False | 21 |
3 | 3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl | 1.142656e+09 | False | 21 |
4 | 4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit | 1.142694e+09 | True | 8 |
def assign_query_time(l):
    """Classify a query's observed day-time flags: 1=day, 0=night, 2=any.

    `l` is an iterable of booleans (one IS_DAY_TIME flag per occurrence of
    the query). A query is "day" when at least 80% of its occurrences are
    daytime, "night" when at most 20% are, and "any" otherwise.
    """
    length = len(l)
    true_count = sum(l)
    day_ratio = true_count / length  # computed once instead of per-branch
    if day_ratio >= 0.8:
        # is day_time
        return 1
    elif day_ratio <= 0.2:
        # is night_time
        return 0
    else:
        # is any_time
        return 2
# Per-query expected label: aggregate each query's IS_DAY_TIME flags.
# NOTE(review): .agg applies assign_query_time to every column, not just
# IS_DAY_TIME; only the "IS_DAY_TIME" entry of the dict is used below.
query_time_compatibility_expected = df.groupby('clean_Query').agg(assign_query_time)
query_time_compatibility_expected_dict = query_time_compatibility_expected.to_dict()
# .copy() so later column assignments target an independent frame — the
# boolean-mask slice is what triggered the SettingWithCopyWarning in the
# original run.
df = df[df['clean_Query'].notna()].copy()
def populate_assigned_query_time(query):
    """Look up the expected day/night label (0/1/2) for a cleaned query."""
    expected_by_query = query_time_compatibility_expected_dict["IS_DAY_TIME"]
    return expected_by_query[query]
df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)
df.head()
<ipython-input-96-c9c24ded9df4>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["EXPECTED_DAY_TIME"] = df['clean_Query'].apply(populate_assigned_query_time)
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | TIMESTAMP | IS_DAY_TIME | QUERY_HOUR | EXPECTED_DAY_TIME | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect | 1.141226e+09 | True | 7 | 1 |
1 | 1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim | 1.142192e+09 | True | 12 | 1 |
2 | 2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 |
3 | 3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 |
4 | 4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit | 1.142694e+09 | True | 8 | 1 |
def is_compatible(row):
    """Return False when a row's actual time-of-day contradicts its expected
    label (day query seen at night, or night query seen in the day); True
    otherwise, including for "any time" queries (EXPECTED_DAY_TIME == 2)."""
    expected = row["EXPECTED_DAY_TIME"]
    observed = row["IS_DAY_TIME"]
    mismatch = (expected == 1 and not observed) or (expected == 0 and observed)
    return not mismatch
# Row-wise: does the actual query time agree with the query's expected label?
df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)
<ipython-input-97-247ccc1e5eec>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["IS_COMPAT"] = df.apply(is_compatible, axis = 1)
# Attach each row's overall query frequency within the training split.
query_freq = df['clean_Query'].value_counts().to_dict()
df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda q: query_freq[q])
df.head()
<ipython-input-98-d482504428c0>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["TOTAL_FREQ"] = df['clean_Query'].apply(lambda x : query_freq[x])
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | TIMESTAMP | IS_DAY_TIME | QUERY_HOUR | EXPECTED_DAY_TIME | IS_COMPAT | TOTAL_FREQ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect | 1.141226e+09 | True | 7 | 1 | True | 1 |
1 | 1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim | 1.142192e+09 | True | 12 | 1 | True | 1 |
2 | 2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 | True | 228 |
3 | 3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 | True | 228 |
4 | 4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit | 1.142694e+09 | True | 8 | 1 | True | 2 |
# NOTE(review): this cell is a duplicate of the earlier add_timestamp cell;
# TIMESTAMP is recomputed with identical values. Kept as-is to preserve the
# notebook's execution order.
def add_timestamp(datetime_str):
    datetime_object = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    return datetime_object.timestamp()
df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
df.head()
<ipython-input-99-d86c64f1d509>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df["TIMESTAMP"] = df["QueryTime"].apply(add_timestamp)
Unnamed: 0 | AnonID | Query | QueryTime | ItemRank | ClickURL | clean_Query | TIMESTAMP | IS_DAY_TIME | QUERY_HOUR | EXPECTED_DAY_TIME | IS_COMPAT | TOTAL_FREQ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 142 | rentdirect.com | 2006-03-01 07:17:12 | NaN | NaN | rentdirect | 1.141226e+09 | True | 7 | 1 | True | 1 |
1 | 1 | 142 | www.prescriptionfortime.com | 2006-03-12 12:31:06 | NaN | NaN | prescriptionfortim | 1.142192e+09 | True | 12 | 1 | True | 1 |
2 | 2 | 142 | staple.com | 2006-03-17 21:19:29 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 | True | 228 |
3 | 3 | 142 | staple.com | 2006-03-17 21:19:45 | NaN | NaN | stapl | 1.142656e+09 | False | 21 | 2 | True | 228 |
4 | 4 | 142 | www.newyorklawyersite.com | 2006-03-18 08:02:58 | NaN | NaN | newyorklawyersit | 1.142694e+09 | True | 8 | 1 | True | 2 |
day_time_second_start = 7 * 3600    # 07:00:00 as seconds-of-day
day_time_second_end = 19 * 3600     # 19:00:00 as seconds-of-day (inclusive)

def agg_all_query_hour(l):
    """Return (NT_FREQ24, NT_FREQ_DAY, NT_FREQ_NIGHT) for one query.

    `l` is an iterable of 'YYYY-MM-DD HH:MM:SS' strings. Each frequency is a
    count normalized by the number of hours in its window: all occurrences
    over 24h, daytime occurrences over the day window, the rest over the
    night window.
    """
    # Derive divisors from the window constants instead of hard-coding 12/12/24.
    day_hours = (day_time_second_end - day_time_second_start) // 3600  # 12
    night_hours = 24 - day_hours
    day = 0
    night = 0
    for datetime_str in l:
        dt = datetime.datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        seconds_of_day = dt.hour * 3600 + dt.minute * 60 + dt.second
        if day_time_second_start <= seconds_of_day <= day_time_second_end:
            day += 1
        else:
            night += 1
    # (stray trailing semicolon on the original return removed)
    return ((day + night) / 24, day / day_hours, night / night_hours)
# Aggregate every query's raw timestamps into (all, day, night) hourly frequencies.
query_all_timestamps = df.groupby('clean_Query')[["QueryTime"]].agg(agg_all_query_hour)
query_all_timestamps_dict = query_all_timestamps.to_dict()
# One row per query; columns 0/1/2 hold NT_FREQ24 / NT_FREQ_DAY / NT_FREQ_NIGHT.
df_q = pd.DataFrame.from_dict(query_all_timestamps_dict.get('QueryTime'),orient='index')
df_q_sorted0 = df_q.sort_values(0, ascending=False).reset_index(level=0)
df_q_sorted1 = df_q.sort_values(1, ascending=False).reset_index(level=0)
df_q_sorted2 = df_q.sort_values(2, ascending=False).reset_index(level=0)
# Query names ranked by overall / day / night frequency respectively.
l0 = list(df_q_sorted0['index'])
l1 = list(df_q_sorted1['index'])
l2 = list(df_q_sorted2['index'])
s_size = 50   # size of the overall-top seed list
ss_size = 30  # size of each secondary (day / night) list
s_list = list(df_q_sorted0[:s_size]['index'])
print(s_list)
['googl', 'yahoo', 'ebay', 'myspac', 'mapquest', 'space', 'hotmail', 'pogo', 'msn', 'craigslist', 'http', 'bankofamerica', 'map quest', 'weather', 'internet', 'dictionari', 'yahoo mail', 'american idol', 'horoscop', 'bank america', 'walmart', 'ask', 'ask jeev', 'n e evergreen solar', 'target', 'amazon', 'ir', 'tattoo', 'map', 'sexi swimwear', 'southwest airlin', 'lyric', 'qvc', 'low', 'porn', 'orbitz', 'fidel', 'yahoo person', 'verizon', 'home depot', 'sex', 'expedia', 'cnn', 'disney channel', 'sear', 'sin midnight kimber logan', 'donni mcclurkin', 'wachovia', 'eharmoni', 'traveloc']
# Top ss_size day-heavy queries that are not already in the overall seed list.
s_set = set(s_list)  # O(1) membership tests instead of scanning the list
ss_day_list = []
# (unused `count = 0` from the original removed)
for q in l1:
    if q in s_set:
        continue
    ss_day_list.append(q)
    if len(ss_day_list) == ss_size:
        break
print(ss_day_list)
['yellow page', 'clive owen', 'amex bema gold', 'sale rep china product', 'kimber logan', 'nyse wit', 'craig list', 'amex bgo', 'monster', 'ticketmast', 'nick', 'blackplanet', 'nyse hl', 'white page', 'usp', 'cartoonetwork', 'donbest', 'nyse eslr', 'nascar', 'hsn', 'cingular', 'runescap', 'adserv ign', 'up', 'best buy', 'n e wipro', 'dogpil', 'realtor', 'match', 'mycl cravelyr']
# Top ss_size night-heavy queries that are not already in the overall seed list.
s_set = set(s_list)  # O(1) membership tests instead of scanning the list
ss_night_list = []
# (unused `count = 0` from the original removed)
for q in l2:
    if q in s_set:
        continue
    ss_night_list.append(q)
    if len(ss_night_list) == ss_size:
        break
print(ss_night_list)
['miami heat', 'myspac layout', 'mycl cravelyr', 'daniel pipe', 'swimwear', 'adserv ign', 'gold', 'donni mcclurkin cd', 'mailbox', 'nbc', 'realtor', 'yahoo game', 'hilari duff', 'american airlin', 'gl ch section relat wetland protect act', 'limewir', 'uproar', 'deal deal', 'pink world', 'donbest', 'cingular', 'raven riley', 'profileedit myspac', 'goggl', 'sheer swimwear', 'runescap', 'espn', 'match', 'slingo', 'n']
import pickle

# Persist the ranked query lists and the train/test splits for later stages.
ranked_lists = (
    ('data/all_list.pkl', l0),
    ('data/day_list.pkl', l1),
    ('data/night_list.pkl', l2),
)
for path, ranked in ranked_lists:
    with open(path, 'wb') as f:
        pickle.dump(ranked, f)
df.to_csv('data/ssd_train_large.csv', sep=',', encoding='utf-8')
test_df.to_csv('data/ssd_test_large.csv', sep=',', encoding='utf-8')