python - Is there a mechanism to load the complete Google Vectors in less memory and in optimized way? -


Here is the code I am dealing with:

import numpy np import theano import cpickle collections import defaultdict import sys, re import pandas pd import csv import getpass   def build_data_cv(datafile, cv=10, clean_string=true):     """     loads data , split 10 folds.     """     revs = []     vocab = defaultdict(float)      open(datafile, "rb") csvf:         csvreader=csv.reader(csvf,delimiter=',',quotechar='"')         first_line=true         line in csvreader:             if first_line:                 first_line=false                 continue             status=[]             sentences=re.split(r'[.?]', line[1].strip())             try:                 sentences.remove('')             except valueerror:                 none              sent in sentences:                 if clean_string:                     orig_rev = clean_str(sent.strip())                     if orig_rev=='':                             continue                     words = set(orig_rev.split())                     splitted = orig_rev.split()                     if len(splitted)>150:                         orig_rev=[]                         splits=int(np.floor(len(splitted)/20))                         index in range(splits):                             orig_rev.append(' '.join(splitted[index*20:(index+1)*20]))                         if len(splitted)>splits*20:                             orig_rev.append(' '.join(splitted[splits*20:]))                         status.extend(orig_rev)                     else:                         status.append(orig_rev)                 else:                     orig_rev = sent.strip().lower()                     words = set(orig_rev.split())                     status.append(orig_rev)                  word in words:                     vocab[word] += 1               datum  = {"y0":1 if line[2].lower()=='y' else 0,                   "y1":1 if line[3].lower()=='y' else 0,                   "y2":1 if line[4].lower()=='y' else 0,                   "y3":1 if line[5].lower()=='y' else 0,           
        "y4":1 if line[6].lower()=='y' else 0,                   "text": status,                   "user": line[0],                   "num_words": np.max([len(sent.split()) sent in status]),                   "split": np.random.randint(0,cv)}             revs.append(datum)       return revs, vocab  def get_w(word_vecs, k=300):     """     word matrix. w[i] vector word indexed     """     vocab_size = len(word_vecs)     word_idx_map = dict()     w = np.zeros(shape=(vocab_size+1, k), dtype=theano.config.floatx)     w[0] = np.zeros(k, dtype=theano.config.floatx)     = 1     word in word_vecs:         w[i] = word_vecs[word]         word_idx_map[word] =         += 1     return w, word_idx_map  def load_bin_vec(fname, vocab):     """     loads 300x1 word vecs google (mikolov) word2vec     """     word_vecs = {}     open(fname, "rb") f:         header = f.readline()         vocab_size, layer1_size = map(int, header.split())         binary_len = np.dtype(theano.config.floatx).itemsize * layer1_size         line in xrange(vocab_size):             word = []             while true:                 ch = f.read(1)                 if ch == ' ':                     word = ''.join(word)                     break                 if ch != '\n':                     word.append(ch)             if word in vocab:                word_vecs[word] = np.fromstring(f.read(binary_len), dtype=theano.config.floatx)             else:                 f.read(binary_len)     return word_vecs  def add_unknown_words(word_vecs, vocab, min_df=1, k=300):     """     words occur in @ least min_df documents, create separate word vector.     0.25 chosen unknown vectors have (approximately) same variance pre-trained ones     """     word in vocab:         if word not in word_vecs , vocab[word] >= min_df:             word_vecs[word] = np.random.uniform(-0.25,0.25,k)             print word  def clean_str(string, trec=false):     """     tokenization/string cleaning datasets except sst.     
every dataset lower cased except trec     """     string = re.sub(r"[^a-za-z0-9(),!?\'\`]", " ", string)     string = re.sub(r"\'s", " \'s ", string)     string = re.sub(r"\'ve", " have ", string)     string = re.sub(r"n\'t", " not ", string)     string = re.sub(r"\'re", " ", string)     string = re.sub(r"\'d" , " ", string)     string = re.sub(r"\'ll", " ", string)     string = re.sub(r",", " , ", string)     string = re.sub(r"!", " ! ", string)     string = re.sub(r"\(", " ( ", string)     string = re.sub(r"\)", " ) ", string)     string = re.sub(r"\?", " \? ", string) #    string = re.sub(r"[a-za-z]{4,}", "", string)     string = re.sub(r"\s{2,}", " ", string)     return string.strip() if trec else string.strip().lower()  def clean_str_sst(string):     """     tokenization/string cleaning sst dataset     """     string = re.sub(r"[^a-za-z0-9(),!?\'\`]", " ", string)     string = re.sub(r"\s{2,}", " ", string)     return string.strip().lower()  def get_mairesse_features(file_name):     feats={}     open(file_name, "rb") csvf:         csvreader=csv.reader(csvf,delimiter=',',quotechar='"')         line in csvreader:             feats[line[0]]=[float(f) f in line[1:]]     return feats  if __name__=="__main__":     w2v_file = sys.argv[1]     data_folder = sys.argv[2]     mairesse_file = sys.argv[3]     print "loading data...",     revs, vocab = build_data_cv(data_folder, cv=10, clean_string=true)     num_words=pd.dataframe(revs)["num_words"]      max_l = np.max(num_words)     print "data loaded!"      print "number of status: " + str(len(revs))     print "vocab size: " + str(len(vocab))     print "max sentence length: " + str(max_l)     print "loading word2vec vectors...",     w2v = load_bin_vec(w2v_file, vocab)     print "word2vec loaded!"      
print "num words in word2vec: " + str(len(w2v))     add_unknown_words(w2v, vocab)     w, word_idx_map = get_w(w2v)     rand_vecs = {}     add_unknown_words(rand_vecs, vocab)     w2, _ = get_w(rand_vecs)     mairesse = get_mairesse_features(mairesse_file)     cpickle.dump([revs, w, w2, word_idx_map, vocab, mairesse], open("essays_mairesse.p", "wb"))     print "dataset created!" 

The process gets killed after I run it, so I started debugging to find the culprit line of code. As usual, my first thought turned out to be right: the process is killed because of the Google News vector file. The file never finishes loading and consumes a huge amount of memory, which is a matter of concern for me.

I am looking for a solution that lets me process the Google vectors in chunks, one at a time, and still get a good training result.

However, chunking is of little use if the complete data cannot be used, because the complete data gives a better resulting trained model.

Kindly suggest how I can get a better result with less, or better-optimized, memory usage.


Comments

Popular posts from this blog

angular - Ionic slides - dynamically add slides before and after -

Add a dynamic header in angular 2 http provider -

minify - Minimizing css files -