# given corpus
# corpus = ["this is doc1 ML this is doc1 ML", "is this right ML right", "yes seems great one ML great right"]
corpus = ["this is dog this is cat", "this is cat", "that is different", "he is different", "they are happy"]
query = ["this is query"]

# bag of words, i.e. the SET of all words seen in the corpus
bag_of_words = []

# stop words
stop_words = ["this", "is", "yes", "seems"]

# read each doc, tokenize it, and collect the tokens
for d in corpus:
    for w in d.split():
        # if w not in stop_words:  # uncomment to filter out stop words
        bag_of_words.append(w)

# deduplicate (sorted for a deterministic dimension order): the final bag of words
bag_of_words = sorted(set(bag_of_words))
print(bag_of_words)

# map each feature (word) to its index in the document vectors
dimensions_index_map = {}
for i, w in enumerate(bag_of_words):
    dimensions_index_map[w] = i
# print(dimensions_index_map)

# vectorize each document in the corpus: one count per vocabulary word
vector_corpus = []
# for d in query:
for d in corpus:
    v1 = [0] * len(bag_of_words)
    tokens = d.split()
    for w in bag_of_words:
        v1[dimensions_index_map[w]] = tokens.count(w)
    vector_corpus.append(v1)
print("BAG OF WORDS MODEL VECTORIZED DOCS\n", vector_corpus)
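
# --- Not part of the original script: a minimal sketch of vectorizing the
# query with the same dimensions_index_map, which the commented-out
# "for d in query" loop above hints at. Assumption: words absent from
# bag_of_words (e.g. "query" itself) are skipped, so their dimensions stay 0.
vector_query = []
for d in query:
    v1 = [0] * len(bag_of_words)
    for w in d.split():
        if w in dimensions_index_map:  # skip out-of-vocabulary words
            v1[dimensions_index_map[w]] += 1
    vector_query.append(v1)
print("BAG OF WORDS MODEL VECTORIZED QUERY\n", vector_query)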