{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "pycharm": { "is_executing": false } }, "outputs": [ { "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mcorpus_raw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'He is the king . The king is royal . She is the royal queen '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# convert to lower case\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mcorpus_raw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpus_raw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorflow'" ], "ename": "ModuleNotFoundError", "evalue": "No module named 'tensorflow'", "output_type": "error" } ], "source": [ "import numpy as np\n", "import tensorflow as tf\n", "corpus_raw = 'He is the king . The king is royal . She is the royal queen '\n", "# convert to lower case\n", "corpus_raw = corpus_raw.lower()\n", "#print(corpus_raw)\n", "words = []\n", "for word in corpus_raw.split():\n", " if word != '.': # because we don't want to treat . 
as a word\n", " words.append(word)\n", "words = set(words) # so that all duplicate words are removed\n", "word2int = {}\n", "int2word = {}\n", "vocab_size = len(words) # gives the total number of unique words\n", "for i,word in enumerate(words):\n", " word2int[word] = i\n", " int2word[i] = word\n", "#print(word2int['queen'])\n", "#print(int2word[5])\n", "\n", "# raw sentences is a list of sentences.\n", "raw_sentences = corpus_raw.split('.')\n", "sentences = []\n", "for sentence in raw_sentences:\n", " sentences.append(sentence.split())\n", "#print(sentences)\n", "data = []\n", "WINDOW_SIZE = 2\n", "for sentence in sentences:\n", " for word_index, word in enumerate(sentence):\n", " for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] : \n", " if nb_word != word:\n", " data.append([word, nb_word])\n", "#print(data)\n", "\n", "# function to convert numbers to one hot vectors\n", "def to_one_hot(data_point_index, vocab_size):\n", " temp = np.zeros(vocab_size)\n", " temp[data_point_index] = 1\n", " return temp\n", "x_train = [] # input word\n", "y_train = [] # output word\n", "for data_word in data:\n", " x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))\n", " y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))\n", "# convert them to numpy arrays\n", "x_train = np.asarray(x_train)\n", "y_train = np.asarray(y_train)\n", "\n", "print(x_train.shape, y_train.shape)\n", "x = tf.placeholder(tf.float32, shape=(None, vocab_size))\n", "y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))\n", "\n", "EMBEDDING_DIM = 5 # you can choose your own number\n", "W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))\n", "b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias\n", "hidden_representation = tf.add(tf.matmul(x,W1), b1)\n", "W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))\n", "b2 = tf.Variable(tf.random_normal([vocab_size]))\n", "prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))\n", "\n", "sess = tf.Session()\n", "init = tf.global_variables_initializer()\n", "sess.run(init) #make sure you do this!\n", "# define the loss function:\n", "cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))\n", "# define the training step:\n", "train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)\n", "n_iters = 10000\n", "# train for n_iter iterations\n", "for _ in range(n_iters):\n", " sess.run(train_step, feed_dict={x: x_train, y_label: y_train})\n", " #print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))\n", "#print(sess.run(W1))\n", "#print('----------')\n", "#print(sess.run(b1))\n", "#print('----------')\n", "vectors = sess.run(W1 + b1)\n", "#print(vectors)\n", "print(vectors[ word2int['queen'] ])" ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" }, "kernelspec": { "name": "python3", "language": "python", "display_name": "Python 3" }, "pycharm": { "stem_cell": { "cell_type": "raw", "source": [], "metadata": { "collapsed": false } } } }, "nbformat": 4, "nbformat_minor": 0 }