Implementing Word2Vec

In [1]:
from __future__ import division
import collections
import math
import os
import random
import zipfile
import numpy as np
import tensorflow as tf

Use urllib.urlretrieve to download the compressed data file and verify its size; if the file has already been downloaded, the download is skipped.

In [2]:
import urllib

url = 'http://mattmahoney.net/dc/'  # base URL of the dataset (see the link below)

def maybe_download(filename, expected_bytes):
    if not os.path.exists(filename):
        filename, _ = urllib.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)
('Found and verified', 'text8.zip')

You can also download the compressed data file directly by entering http://mattmahoney.net/dc/text8.zip in your browser's address bar.

Next, extract the downloaded archive and use tf.compat.as_str to convert the data into a list of words.

In [3]:
# read the archive and store all words in a list
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print 'Data size', len(words)
Data size 17005207

The output shows that the data has been converted into a list of 17,005,207 words.

In [4]:
vocabulary_size = 50000  # keep the 50000 most frequent words in the count list, then put them into dictionary


def build_dataset(words):
    count = [['UNK', -1]]  # each entry is [word, frequency]; the -1 will be replaced below by the actual count of UNK
    # store the 50000 most frequent words in count
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))  # vocabulary_size - 1 because UNK already takes one slot

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    '''
    Equivalent to numbering the words 0, 1, 2, ... in the order they appear in count:
        for i in range(vocabulary_size):
            dictionary[count[i][0]] = i
    '''
    # Encoding: words not found in dictionary get index 0 (UNK); otherwise use their index from dictionary.
    # That is, store the index of every word in words into data, and count the UNK words along the way
    # so the -1 in count can be replaced.
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)

    count[0][1] = unk_count

    # index -> word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words)

words[:10]

Output: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']

data[:10]

Output: [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156]

count[:10]

Output: [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]

dictionary  # word -> index

Output: {'fawn': 45848, 'homomorphism': 9648, 'nordisk': 39343, 'nunnery': 36075, 'chthonic': 33554, 'sowell': 40562, 'sonja': 38175, 'showa': 32906, 'woods': 6263, 'hsv': 44222, 'spiders': 14623, 'hanging': 8021, 'woody': 11150, ... }

dictionary['UNK']

Output: 0

dictionary['a']

Output: 6

reverse_dictionary  # index -> word

Output: {0: 'UNK', 1: 'the', 2: 'of', 3: 'and', 4: 'one', 5: 'in', 6: 'a', 7: 'to', 8: 'zero', ... }

In [5]:
del words  # delete the original word list to save memory
print 'Most common word (+UNK)', count[:5]  # print the most frequent words and their counts (including the UNK token)
print 'Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]  # the first 10 word indices and the corresponding words
Most common word (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
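
As a quick sanity check on these mappings, the short sketch below (assuming the dictionary and reverse_dictionary objects built above; the sample words are chosen arbitrarily) encodes a few words to indices and decodes them back:

# hypothetical round-trip through the word <-> index mappings
sample_words = ['anarchism', 'of', 'some_rare_word']
encoded = [dictionary.get(w, 0) for w in sample_words]  # words outside the top 50000 map to 0 (UNK)
decoded = [reverse_dictionary[i] for i in encoded]
print encoded   # e.g. [5239, 2, 0]
print decoded   # e.g. ['anarchism', 'of', 'UNK']
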
In [6]:
# generate training samples for Word2Vec
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    global data_index  # declared global because generate_batch will be called repeatedly
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    # initialize batch and labels as arrays
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # number of words used when creating the samples for one target word,
    # including the target word itself and the words before and after it
    span = 2 * skip_window + 1

    # create a deque (double-ended queue) with maximum length span;
    # when appending to it, only the last span elements are kept
    buffer = collections.deque(maxlen=span)

    # starting from data_index, read span words into buffer as the initial window; buffer stores word indices
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # buffer has capacity span, so it is now full; later appends push out the oldest entries

    # each iteration of this loop generates the samples for one target word; the assertion above guarantees divisibility
    for i in range(batch_size // num_skips):  # // (floor division) keeps the result an int; plain / would give a float here because of the __future__ division import
        # buffer now holds the target word and all of its context words
        target = skip_window  # the word at position skip_window in buffer is the target word (note the first target is buffer[skip_window], not buffer[0])
        targets_to_avoid = [skip_window]  # when sampling context words below, the target word itself must be excluded

        # each iteration of this loop generates one (target, context) sample
        for j in range(num_skips):
            # draw random positions until one is found that is not in targets_to_avoid; use it as the context word
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)  # this context position has been used, so add it to targets_to_avoid

            batch[i * num_skips + j] = buffer[skip_window]  # the feature is the target word
            labels[i * num_skips + j, 0] = buffer[target]  # the label is the context word buffer[target]

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

Call generate_batch to check that it works as expected.

In [7]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]
3084 originated -> 12 as
3084 originated -> 5239 anarchism
12 as -> 6 a
12 as -> 3084 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a
In [8]:
# training parameters
batch_size = 128
embedding_size = 128  # dimensionality of the dense word vectors, typically a value between 50 and 1000; here we use 128
skip_window = 1  # maximum distance between the target word and a context word
num_skips = 2  # number of samples generated for each target word

# validation data: randomly sample some of the most frequent words and check whether
# their nearest neighbors in the vector space look semantically related
valid_size = 16  # number of validation words to sample
valid_window = 100  # validation words are drawn only from the 100 most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # sample without replacement
num_sampled = 64  # number of negative (noise) words sampled during training
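
To see which validation words were actually drawn, an optional one-line check (using the valid_examples and reverse_dictionary objects defined above) is:

print [reverse_dictionary[i] for i in valid_examples]  # e.g. 16 of the 100 most frequent words; the exact sample is random
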
In [9]:
graph = tf.Graph()
with graph.as_default():
    # input placeholders
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # turn the randomly chosen valid_examples into a TensorFlow constant

    with tf.device('/cpu:0'):  # pin these ops to the CPU
        # randomly initialize the embedding vectors of all words: vocabulary size 50000, embedding dimension 128
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # look up the embeddings corresponding to train_inputs
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # weights and biases for the NCE loss, which serves as the training objective
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_bias = tf.Variable(tf.zeros([vocabulary_size]))

    # use tf.nn.nce_loss to compute the loss of the learned embeddings embed on this batch, and average it with tf.reduce_mean
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights, biases=nce_bias, labels=train_labels, inputs=embed, num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    # optimizer: SGD with a learning rate of 1.0
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # compute the L2 norm of each embedding vector
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    # normalize the embeddings to unit length
    normalized_embeddings = embeddings / norm
    # look up the embeddings of the validation words and compute their similarity to every word in the vocabulary
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # initializer for all model parameters
    init = tf.global_variables_initializer()
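
The similarity computed above is simply cosine similarity: once every embedding is divided by its L2 norm, the dot product of two rows equals the cosine of the angle between them. Here is a minimal NumPy sketch of the same computation, using a toy matrix with made-up values (not the model's embeddings):

import numpy as np

# toy "embeddings": 4 words in 3 dimensions (values are arbitrary)
emb = np.array([[1.0, 0.0, 0.0],
                [2.0, 0.0, 0.0],
                [0.0, 1.0, 0.0],
                [1.0, 1.0, 0.0]])

norm = np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
normalized = emb / norm                  # every row now has unit length

sim = np.dot(normalized, normalized.T)   # sim[i, j] = cosine similarity of words i and j
print(sim)                               # rows 0 and 1 point in the same direction -> similarity 1.0
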
In [ ]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    init.run()
    print 'Initialized'

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print 'Average loss at step {} : {}'.format(step, average_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to {} :'.format(valid_word)

                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '{} {},'.format(log_str, close_word)
                print log_str
    # evaluate the normalized embeddings once after training finishes
    final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0 : 253.554870605
Nearest to use : flyers, unborn, comprehended, sega, eliminating, prophetic, nano, empties,
Nearest to its : commandments, natura, eon, bed, timur, molyneux, evangelicalism, chaco,
Nearest to seven : immortalized, wenham, unclaimed, typewriters, af, counterbalanced, mansion, roommates,
Nearest to there : testosterone, majdanek, usage, numerology, sulphur, triad, bad, cherry,
Nearest to by : unsigned, salinger, sitter, karaca, opel, undeniably, portugal, et,
Nearest to to : amp, snowflakes, melatonin, sideways, quarrels, milhaud, isla, tonle,
Nearest to all : ontology, eraserhead, griffey, curses, listens, howstuffworks, polar, mexicana,
Nearest to first : blow, rangoon, volution, crimp, compostela, manchuria, subspecies, sixteen,
Nearest to history : shaman, alvar, russians, blaxploitation, numeration, halfdan, duckling, wondered,
Nearest to at : kbe, telegraphic, cats, uri, generational, baseline, regression, maoists,
Nearest to which : alvarez, magee, dios, sojourn, rabbani, xenon, inductively, flask,
Nearest to see : cubes, considers, heero, upto, germ, ah, tattoo, whipped,
Nearest to are : maois, chute, bionic, kept, unguided, exacted, panth, mieszko,
Nearest to five : arithmetica, blocs, conflicted, miracle, expounded, disband, hakim, derwent,
Nearest to often : ethnic, ltte, adria, fortieth, cad, mend, tuileries, involvement,
Nearest to may : waza, pashtun, oxygenated, ulcers, tragedians, rehearsals, outgoing, edom,
Average loss at step 2000 : 113.060384287
Average loss at step 4000 : 53.0690288844
Average loss at step 6000 : 33.2609856217
Average loss at step 8000 : 23.0952055303
Average loss at step 10000 : 18.2373447652
Nearest to use : vs, victoriae, baseball, vector, opiate, alpina, injured, reginae,
Nearest to its : commandments, the, bed, reginae, akira, decades, victoriae, a,
Nearest to seven : nine, zero, cl, vs, mathbf, reginae, explicitly, analogue,
Nearest to there : bad, gland, potato, usage, renovate, november, it, structures,
Nearest to by : and, in, as, austin, UNK, was, nine, are,
Nearest to to : and, with, in, for, austin, vs, cl, one,
Nearest to all : gland, implicit, polar, ontology, order, ancient, grapheme, cat,
Nearest to first : infections, vs, latitude, blow, best, public, mcclellan, one,
Nearest to history : russians, alpina, saul, kurtz, victoriae, bce, tickets, pervasive,
Nearest to at : cl, two, on, bucks, austin, of, kbe, presumably,
Nearest to which : helicopters, tuberculosis, flask, comparative, never, agricultural, litigation, burgeoning,
Nearest to see : canaris, centuries, gland, considers, ah, fan, blend, equipped,
Nearest to are : is, by, and, bionic, constellations, gland, kept, were,
Nearest to five : reginae, alpina, victoriae, nine, gland, three, zero, cl,
Nearest to often : ethnic, accordion, balance, published, dreyfus, bang, austin, alchemy,
Nearest to may : waza, vs, gogh, hannah, substantially, assesses, pashtun, bonding,
Average loss at step 12000 : 14.3100124947
Average loss at step 14000 : 11.8394078088
Average loss at step 16000 : 9.94490135089
Average loss at step 18000 : 8.49552821398
Average loss at step 20000 : 8.02561856115
Nearest to use : unborn, velar, victoriae, vs, dasyprocta, opiate, vector, prophetic,
Nearest to its : the, his, their, bed, metis, decades, commandments, moravia,
Nearest to seven : nine, zero, five, eight, two, agouti, four, three,
Nearest to there : it, backus, he, gland, dasyprocta, bad, potato, they,
Nearest to by : in, was, as, with, from, for, is, dasyprocta,
Nearest to to : for, would, and, vs, valentinians, dasyprocta, agouti, in,
Nearest to all : subkey, gland, order, dasyprocta, ontology, polar, journalists, ancient,
Nearest to first : rangoon, best, brady, agouti, amman, public, blow, vs,
Nearest to history : alvar, saul, russians, tickets, alphorn, alpina, kurtz, pervasive,
Nearest to at : in, on, agouti, cl, and, of, bodybuilding, two,
Nearest to which : that, never, also, and, dasyprocta, metis, it, comparative,
Nearest to see : canaris, is, considers, ah, fan, blend, centuries, dictionaries,
Nearest to are : is, were, was, by, in, gland, mg, asher,
Nearest to five : zero, eight, nine, seven, three, six, two, dasyprocta,
Nearest to often : mattingly, dasyprocta, now, accordion, ethnic, mats, balance, io,
Nearest to may : waza, tragedians, dasyprocta, eight, three, gogh, nur, would,
Average loss at step 22000 : 7.2253218497
Average loss at step 24000 : 6.99454965758
Average loss at step 26000 : 6.6251022315
Average loss at step 28000 : 6.20031205404
Average loss at step 30000 : 6.14926322377
Nearest to use : unborn, velar, victoriae, dasyprocta, vs, prophetic, azimuth, opiate,
Nearest to its : the, their, his, a, bed, moravia, decades, metis,
Nearest to seven : eight, five, nine, four, six, three, zero, two,
Nearest to there : it, they, he, still, dasyprocta, she, gland, potato,
Nearest to by : was, in, as, with, from, for, and, is,
Nearest to to : for, would, in, from, can, eight, nine, and,
Nearest to all : subkey, gland, abitibi, dasyprocta, ontology, journalists, order, polar,
Nearest to first : best, rangoon, agouti, brady, amman, infections, vs, blow,
Nearest to history : shaman, saul, tickets, alvar, russians, alphorn, alpina, kurtz,
Nearest to at : in, on, and, agouti, with, for, cl, dasyprocta,
Nearest to which : that, also, it, never, and, dasyprocta, litigation, this,
Nearest to see : is, canaris, akita, considers, cubes, blend, ah, centuries,
Nearest to are : were, is, have, was, by, gland, be, asterism,
Nearest to five : eight, four, six, seven, three, zero, nine, two,
Nearest to often : now, mattingly, mats, dasyprocta, salinas, shab, a, also,
Nearest to may : can, would, waza, eight, tragedians, dasyprocta, nine, to,
Average loss at step 32000 : 5.87546185231
Average loss at step 34000 : 5.8339735496
Average loss at step 36000 : 5.73484470379
Average loss at step 38000 : 5.28538503897
Average loss at step 40000 : 5.47259349346
Nearest to use : victoriae, dasyprocta, velar, vs, unborn, llama, opiate, reginae,
Nearest to its : their, the, his, some, bed, harlan, a, metis,
Nearest to seven : eight, five, six, four, nine, three, zero, one,
Nearest to there : it, they, he, still, and, handler, she, not,
Nearest to by : was, with, in, be, from, as, is, were,
Nearest to to : would, imprint, for, nine, can, from, will, dasyprocta,
Nearest to all : gland, subkey, abitibi, dasyprocta, journalists, many, two, agouti,
Nearest to first : best, rangoon, agouti, brady, blow, hora, infections, defend,
Nearest to history : shaman, saul, tickets, alvar, recitative, alpina, blaxploitation, russians,
Nearest to at : in, on, agouti, with, dasyprocta, cl, from, and,
Nearest to which : that, also, it, this, but, albury, never, and,
Nearest to see : canaris, cubes, akita, considers, is, referenced, ah, blend,
Nearest to are : were, is, have, was, gland, be, asterism, by,
Nearest to five : four, six, seven, three, eight, zero, nine, two,
Nearest to often : now, mats, also, mattingly, dasyprocta, it, generally, who,
Nearest to may : can, would, waza, hydrothermal, dasyprocta, tragedians, must, three,
Average loss at step 42000 : 5.30186149359
Average loss at step 44000 : 5.29712139511
Average loss at step 46000 : 5.2605512054
Average loss at step 48000 : 5.02441073215
Average loss at step 50000 : 5.16865433538
Nearest to use : beeb, victoriae, dasyprocta, velar, perfective, vs, unborn, opiate,
Nearest to its : their, his, the, harlan, bed, moravia, reginae, metis,
Nearest to seven : eight, six, five, nine, four, three, zero, one,
Nearest to there : it, they, he, still, she, dasyprocta, triad, handler,
Nearest to by : was, be, with, from, as, were, in, for,
Nearest to to : would, will, imprint, nine, bouldering, valentinians, albury, can,
Nearest to all : abitibi, gland, subkey, dasyprocta, many, journalists, agouti, ontology,
Nearest to first : best, brady, last, agouti, in, hora, backus, defend,
Nearest to history : shaman, alvar, tickets, saul, recitative, dipyramid, russians, alphorn,
Nearest to at : in, on, prism, agouti, with, and, dasyprocta, agni,
Nearest to which : that, this, also, it, but, albury, one, never,
Nearest to see : canaris, cubes, is, akita, referenced, asterism, maud, blend,
Nearest to are : were, is, have, gland, be, was, do, including,
Nearest to five : four, six, three, seven, eight, zero, nine, two,
Nearest to often : now, also, mats, generally, which, jen, mattingly, naaman,
Nearest to may : can, would, must, will, should, could, hydrothermal, waza,
Average loss at step 52000 : 5.16755377603
Average loss at step 54000 : 5.11729208422
Average loss at step 56000 : 5.05119970107
Average loss at step 58000 : 5.08750091422
Average loss at step 60000 : 4.93481853378
Nearest to use : beeb, victoriae, dasyprocta, marmoset, perfective, callithrix, velar, llama,
Nearest to its : their, his, the, her, some, metis, ssbn, reginae,
Nearest to seven : eight, six, five, nine, four, three, zero, two,
Nearest to there : they, it, he, still, she, handler, not, triad,
Nearest to by : was, be, as, with, were, under, naaman, wct,
Nearest to to : would, imprint, nine, bouldering, dasyprocta, will, for, vs,
Nearest to all : many, gland, abitibi, subkey, these, dasyprocta, some, agouti,
Nearest to first : best, last, agouti, microsite, hora, brady, mcclellan, rangoon,
Nearest to history : microcebus, michelob, wct, recitative, cebus, alpina, dasyprocta, tickets,
Nearest to at : in, on, cebus, with, prism, under, agouti, callithrix,
Nearest to which : that, this, also, it, but, callithrix, albury, one,
Nearest to see : canaris, akita, cubes, asterism, ssbn, referenced, prism, tamarin,
Nearest to are : were, is, have, be, gland, do, including, ssbn,
Nearest to five : four, six, three, eight, seven, nine, zero, two,
Nearest to often : now, also, generally, wct, mats, jen, which, dasyprocta,
Nearest to may : can, would, must, will, could, should, hydrothermal, waza,
Average loss at step 62000 : 4.78494637048
Average loss at step 64000 : 4.79264241815
Average loss at step 66000 : 4.97760552907
Average loss at step 68000 : 4.91262453341
Average loss at step 70000 : 4.75980962324
Nearest to use : beeb, victoriae, dasyprocta, marmoset, callithrix, perfective, velar, fuerteventura,
Nearest to its : their, his, the, her, ssbn, some, metis, agouti,
Nearest to seven : six, eight, four, nine, three, five, zero, one,
Nearest to there : they, it, still, he, we, she, often, handler,
Nearest to by : was, be, in, as, were, from, with, wct,
Nearest to to : would, can, will, imprint, for, vs, nine, bouldering,
Nearest to all : many, some, gland, these, abitibi, subkey, dasyprocta, hands,
Nearest to first : best, last, hora, second, defend, agouti, mcclellan, microsite,
Nearest to history : microcebus, michelob, mitral, wct, saul, tickets, recitative, cebus,
Nearest to at : in, on, prism, cebus, during, agouti, dasyprocta, wct,
Nearest to which : that, this, also, but, it, callithrix, albury, beaches,
Nearest to see : canaris, akita, cubes, ssbn, thaler, referenced, asterism, prism,
Nearest to are : were, is, have, be, including, gland, do, sutra,
Nearest to five : four, six, three, eight, seven, zero, nine, two,
Nearest to often : now, generally, also, wct, sometimes, mats, jen, which,
Nearest to may : can, would, will, must, could, should, hydrothermal, might,
Average loss at step 72000 : 4.80357911384
Average loss at step 74000 : 4.77398807603
Average loss at step 76000 : 4.89073816597
Average loss at step 78000 : 4.80883758235
Average loss at step 80000 : 4.7976956389
Nearest to use : beeb, victoriae, clodius, dasyprocta, marmoset, callithrix, crb, perfective,
Nearest to its : their, his, the, her, ssbn, vec, reginae, agouti,
Nearest to seven : six, eight, five, four, three, nine, zero, callithrix,
Nearest to there : it, they, he, still, we, she, clodius, often,
Nearest to by : was, cegep, be, in, from, with, vec, wct,
Nearest to to : mitsuda, imprint, would, will, bouldering, wct, vs, dasyprocta,
Nearest to all : many, some, these, gland, dasyprocta, subkey, abitibi, two,
Nearest to first : last, best, second, hora, mcclellan, defend, next, agouti,
Nearest to history : microcebus, michelob, wct, mitral, tickets, shaman, saul, alphorn,
Nearest to at : in, on, cegep, during, prism, cebus, dasyprocta, wct,
Nearest to which : that, this, also, it, but, callithrix, albury, cebus,
Nearest to see : canaris, akita, cubes, ssbn, prism, thaler, asterism, lobbied,
Nearest to are : were, is, have, including, do, be, gland, while,
Nearest to five : six, four, seven, eight, three, nine, zero, two,
Nearest to often : now, generally, also, sometimes, wct, usually, commonly, which,
Nearest to may : can, would, will, could, must, should, might, crb,
Average loss at step 82000 : 4.80440880048
Average loss at step 84000 : 4.79721266794
Average loss at step 86000 : 4.75776743543
Average loss at step 88000 : 4.69816068482
Average loss at step 90000 : 4.75949970603
Nearest to use : beeb, victoriae, dasyprocta, marmoset, clodius, callithrix, crb, perfective,
Nearest to its : their, his, the, her, ssbn, some, reginae, fath,
Nearest to seven : six, eight, five, four, nine, three, zero, agouti,
Nearest to there : they, it, he, still, she, we, often, clodius,
Nearest to by : cegep, was, be, when, vec, as, through, wct,
Nearest to to : would, imprint, mitsuda, will, nine, can, albury, wct,
Nearest to all : many, some, these, gland, dasyprocta, peacocks, abitibi, several,
Nearest to first : best, last, second, hora, agouti, next, microsite, mcclellan,
Nearest to history : microcebus, michelob, wct, mitral, tickets, shaman, dipyramid, saul,
Nearest to at : in, on, during, under, cegep, cebus, prism, wct,
Nearest to which : that, this, also, but, it, callithrix, albury, both,
Nearest to see : akita, canaris, yyyy, ssbn, cubes, but, asterism, thaler,
Nearest to are : were, is, have, be, including, do, gland, while,
Nearest to five : seven, six, four, eight, three, nine, zero, two,
Nearest to often : now, generally, sometimes, also, usually, commonly, wct, mats,
Nearest to may : can, would, will, could, must, should, might, crb,
Average loss at step 92000 : 4.6982008096
Average loss at step 94000 : 4.61954891503
Average loss at step 96000 : 4.74444079077
Average loss at step 98000 : 4.61056392497
Average loss at step 100000 : 4.67929824364
Nearest to use : beeb, victoriae, dasyprocta, callithrix, marmoset, clodius, crb, perfective,
Nearest to its : their, his, the, her, ssbn, some, fath, reginae,
Nearest to seven : six, eight, five, four, nine, three, zero, two,
Nearest to there : they, it, he, still, we, she, however, often,
Nearest to by : be, cegep, was, after, were, as, vec, when,
Nearest to to : would, will, can, nine, imprint, wct, albury, bouldering,
Nearest to all : many, some, these, several, dasyprocta, gland, peacocks, abitibi,
Nearest to first : last, second, best, hora, next, original, mcclellan, microsite,
Nearest to history : microcebus, wct, mitral, michelob, tickets, shaman, cebus, saul,
Nearest to at : in, on, during, cegep, under, cebus, agni, charcot,
Nearest to which : that, this, also, but, it, callithrix, these, one,
Nearest to see : akita, cubes, canaris, yyyy, list, thaler, ssbn, but,
Nearest to are : were, is, have, be, do, including, while, include,
Nearest to five : four, seven, six, eight, three, zero, nine, two,
Nearest to often : now, generally, sometimes, usually, commonly, also, wct, still,
Nearest to may : can, would, will, could, must, should, might, cannot,

Next we define a function to visualize the Word2Vec results. Here low_dim_embs holds the word vectors reduced to 2 dimensions, and we plot the position of each word in a figure. plt.scatter draws the scatter plot (the word positions), plt.annotate labels each point with the word itself, and plt.savefig saves the figure to a local file.

In [ ]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)

We use sklearn.manifold.TSNE for the dimensionality reduction, projecting the original 128-dimensional embeddings directly down to 2 dimensions, and then display them with the plot_with_labels function defined above. Only the 100 most frequent words are visualized.

In [ ]:
from sklearn.manifold import TSNE
import matplotlib
matplotlib.use('Agg')  # non-interactive backend, so the figure can be saved without a display
import matplotlib.pyplot as plt

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 100
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)