Hello,
I am writing a paper on the effect of word embeddings on machine learning algorithms.
My code is almost the same as this one, with a few small differences: https://github.com/iamaziz/ar-embeddings/blob/master/asa.py
I have already used two sets of word embeddings from other authors, both with the .bin extension, without problems.
However, when I use "full_grams_sg_300_twitter.mdl", I get this error:
UnicodeDecodeError Traceback (most recent call last)
in ()
5 dataset_path = "Sport_TrainingSet_1000_01.csv"
6 # run
----> 7 ArSentiment(embeddings_path, dataset_path, plot_roc=True)
8
in __init__(self, embeddings_file, dataset_file, plot_roc, split, detailed)
12 self.split = split
13
---> 14 self.embeddings, self.dimension = self.load_vectors(embeddings_file)
15
16 # read dataset
in load_vectors(model_name, binary)
66 """load the pre-trained embedding model"""
67 if binary:
---> 68 w2v_model = KeyedVectors.load_word2vec_format(model_name, binary=True)
69 else:
70 w2v_model = gensim.models.Word2Vec.load(model_name)
/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
1436 return _load_word2vec_format(
1437 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1438 limit=limit, datatype=datatype)
1439
1440 def get_keras_embedding(self, train_embeddings=False):
/anaconda3/lib/python3.7/site-packages/gensim/models/utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
170 logger.info("loading projection weights from %s", fname)
171 with utils.smart_open(fname) as fin:
--> 172 header = utils.to_unicode(fin.readline(), encoding=encoding)
173 vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
174 if limit:
/anaconda3/lib/python3.7/site-packages/gensim/utils.py in any2unicode(text, encoding, errors)
353 if isinstance(text, unicode):
354 return text
--> 355 return unicode(text, encoding, errors=errors)
356
357
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte