0 votes
1 view
in Data Science by (17.6k points)

I am new at working with word2vec. I need to fine tune my word2vec model.

I have two datasets, data1 and data2. What I have done so far is:

# Asker's attempt: build a Word2Vec model on data1, then call train() on data1
# and on data2. size_v, size_w, min_c, work and epochs are not defined in this
# snippet -- presumably hyperparameters set elsewhere.
model = gensim.models.Word2Vec(

        data1,

        size=size_v,

        window=size_w,

        min_count=min_c,

        workers=work)

# Explicit training pass over data1.
model.train(data1, total_examples=len(data1), epochs=epochs)

# Training pass over the second dataset, data2.
model.train(data2, total_examples=len(data2), epochs=epochs)

Is this correct? Do I need to store learned weights somewhere?

I checked this answer and this one, but I couldn't understand how it's done.

Can someone explain to me the steps to follow?

Thank you in advance

1 Answer

0 votes
by (38.2k points)

If you pass data1 when instantiating the model, there is no need to call train() with data1 again: the constructor will already have run its own internal build_vocab() and train() on the supplied corpus, using the default number of epochs.

Fine-tuning is a delicate process: it has to be done in a few careful, deliberate steps if it is to actually improve the model.

When you fine-tune an already existing word2vec model, your vocabulary must stay consistent with the vocabulary of that model.

Now, refer to the code below:

.

import os

import pickle

import numpy as np

import gensim

from gensim.models import Word2Vec, KeyedVectors

from gensim.models.callbacks import CallbackAny2Vec

import operator

# Create the output directory for the saved model. exist_ok=True keeps the
# script re-runnable: os.mkdir would raise FileExistsError on a second run.
os.makedirs("model_dir", exist_ok=True)

# class EpochSaver(CallbackAny2Vec):

#     '''Callback to save model after each epoch.'''

#     def __init__(self, path_prefix):

#         self.path_prefix = path_prefix

#         self.epoch = 0

#     def on_epoch_end(self, model):

#         list_of_existing_files = os.listdir(".")

#         output_path = 'model_dir/{}_epoch{}.model'.format(self.path_prefix, self.epoch)

#         try:

#             model.save(output_path)

#         except:

#             model.wv.save_word2vec_format('model_dir/model_{}.bin'.format(self.epoch), binary=True)

#         print("number of epochs completed = {}".format(self.epoch))

#         self.epoch += 1

#         list_of_total_files = os.listdir(".")

# saver = EpochSaver("my_finetuned")

# function to load vectors from existing model.

# I am loading glove vectors from a text file, benefit of doing this is that I get complete vocab of glove as well.

# If you are using a previous word2vec model, I would recommend saving it in txt format.

# In case you decide not to do it, you can tweak the function to get vectors for words in your vocab only.

def load_vectors(token2id, path, limit=None, dim=300):
    """Load pretrained embeddings from a text file into a dense matrix.

    Each usable line of the file has the form ``token v1 v2 ... v<dim>``,
    separated by single spaces. The token is lower-cased and its vector is
    written to row ``token2id[token]`` of the result. If several lines
    lower-case to the same token, the last one wins.

    Lines that do not contain exactly ``dim`` values after the token are
    skipped. That covers header lines, blank lines and rows whose token
    itself contains a space. Tokens that are absent from ``token2id`` are
    skipped as well, so the function can also be used to load vectors for
    a restricted vocabulary.

    Args:
        token2id: Mapping from token to row index in the returned matrix.
        path: Path to the embedding text file.
        limit: Maximum number of vectors to load, or None for no limit.
        dim: Dimensionality of the embeddings stored in the file.

    Returns:
        A float32 array of shape ``(len(token2id), dim)``. Rows for tokens
        that were not found in the file are left as zeros.

    Raises:
        ValueError: If a line has the right number of fields but one of
            them cannot be parsed as a float.
    """
    vectors = np.zeros((len(token2id), dim), dtype='f')
    loaded = 0
    with open(path, encoding="utf8", errors='ignore') as f:
        for line in f:
            # ">=" so that exactly `limit` vectors are loaded, not limit + 1.
            if limit is not None and loaded >= limit:
                break
            token, *values = line.rstrip().split(' ')
            if len(values) != dim:
                # Not a well-formed embedding row for this dimensionality.
                continue
            row = token2id.get(token.lower())
            if row is None:
                # Token is outside the requested vocabulary.
                continue
            vectors[row] = np.array(values, 'f')
            loaded += 1
    return vectors

embedding_name = "glove.840B.300d.txt"
data = "<training data(new line separated tect file)>"

# Maps every token in the vocabulary to a unique integer id. Ids are handed out
# sequentially, starting at 0, in order of first appearance.
token2id = {}

# Maps every token to the number of times it occurs in the training data.
vocab_freq_dict = {}

# Tokenised training sentences: one list of tokens per non-blank input line.
training_examples = []

# Next unused token id.
id_ = 0

# Populate vocab_freq_dict, token2id and training_examples from the training
# data. The context manager guarantees the file is closed, and iterating the
# handle streams the file line by line instead of loading it all at once.
with open(data, 'r', encoding="utf-8") as data_file:
    for line in data_file:
        # Tokens are separated by single spaces. Empty strings produced by
        # blank lines or repeated spaces are dropped so that '' never ends up
        # as a vocabulary entry.
        words = [word for word in line.strip().split(" ") if word]
        if not words:
            continue
        training_examples.append(words)
        for word in words:
            vocab_freq_dict[word] = vocab_freq_dict.get(word, 0) + 1
            if word not in token2id:
                token2id[word] = id_
                id_ += 1

# Extend token2id and vocab_freq_dict with the GloVe vocabulary.

# New ids continue after the largest id assigned so far. default=-1 makes the
# first GloVe token receive id 0 when token2id is still empty, instead of
# max() raising ValueError on an empty mapping.
max_token_id = max(token2id.values(), default=-1)

with open(embedding_name, encoding="utf8", errors='ignore') as f:
    for o in f:
        # Lines of 100 characters or fewer are skipped -- presumably headers
        # or rows too short to hold a full embedding.
        if len(o) <= 100:
            continue
        # The token is the first space-separated field, lower-cased.
        token = o.split(' ', 1)[0].lower()
        if token not in token2id:
            max_token_id += 1
            token2id[token] = max_token_id
            # Tokens seen only in GloVe get a nominal frequency of 1.
            vocab_freq_dict[token] = 1

# Persist both lookup tables to disk, each pickled into a file named after the
# table it holds.
for pickle_path, table in (
    ("vocab_freq_dict", vocab_freq_dict),
    ("token2id", token2id),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(table, pickle_file)

# Wrap the loaded embedding matrix in a gensim KeyedVectors object.
vectors = load_vectors(token2id, embedding_name)
vec = KeyedVectors(vector_size=300)
# Row i of `vectors` is paired with the i-th key of token2id, so this relies on
# token2id iterating in id order.
vec.add(list(token2id), vectors, replace=True)

# Rebind the name to None so the raw numpy array can be garbage-collected.
vectors = None

# Hyperparameters of the model that will be fine-tuned.
params = {"min_count": 1, "workers": 14, "iter": 6, "size": 300}
model = Word2Vec(**params)

# Build the model vocabulary directly from the token -> frequency table.
model.build_vocab_from_freq(vocab_freq_dict)

# For every word, in the model's own index order, the id it has in token2id
# (which is used below as its row in vec.vectors).
idxmap = np.array(list(map(token2id.__getitem__, model.wv.index2entity)))

# Seed the word vectors (model.wv.vectors) with the pretrained vectors,
# rearranged into the model's index order.
model.wv.vectors[:] = vec.vectors[idxmap]

# Seed the syn1neg weights with the same pretrained vectors.
model.trainables.syn1neg[:] = vec.vectors[idxmap]

# Continue training on the new data for the configured number of epochs.
model.train(
    training_examples,
    total_examples=len(training_examples),
    epochs=model.epochs,
)

# Save the fine-tuned model.
output_path = 'model_dir/final_model.model'
model.save(output_path)

If you wish to learn more about how to use python for data science, then go through data science python programming course by Intellipaat for more insights.

Welcome to Intellipaat Community. Get your technical queries answered by top developers !


Categories

...