python - Is there a mechanism to load the complete Google Vectors in less memory and in an optimized way?
Here is the code I am dealing with:
import numpy as np
import theano
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd
import csv
import getpass


def build_data_cv(datafile, cv=10, clean_string=True):
    """
    Loads data and splits it into 10 folds.
    """
    revs = []
    vocab = defaultdict(float)
    with open(datafile, "rb") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        first_line = True
        for line in csvreader:
            if first_line:
                first_line = False
                continue
            status = []
            sentences = re.split(r'[.?]', line[1].strip())
            try:
                sentences.remove('')
            except ValueError:
                pass

            for sent in sentences:
                if clean_string:
                    orig_rev = clean_str(sent.strip())
                    if orig_rev == '':
                        continue
                    words = set(orig_rev.split())
                    splitted = orig_rev.split()
                    if len(splitted) > 150:
                        # split very long sentences into 20-word chunks
                        orig_rev = []
                        splits = int(np.floor(len(splitted) / 20))
                        for index in range(splits):
                            orig_rev.append(' '.join(splitted[index * 20:(index + 1) * 20]))
                        if len(splitted) > splits * 20:
                            orig_rev.append(' '.join(splitted[splits * 20:]))
                        status.extend(orig_rev)
                    else:
                        status.append(orig_rev)
                else:
                    orig_rev = sent.strip().lower()
                    words = set(orig_rev.split())
                    status.append(orig_rev)

                for word in words:
                    vocab[word] += 1

            datum = {"y0": 1 if line[2].lower() == 'y' else 0,
                     "y1": 1 if line[3].lower() == 'y' else 0,
                     "y2": 1 if line[4].lower() == 'y' else 0,
                     "y3": 1 if line[5].lower() == 'y' else 0,
                     "y4": 1 if line[6].lower() == 'y' else 0,
                     "text": status,
                     "user": line[0],
                     "num_words": np.max([len(sent.split()) for sent in status]),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    return revs, vocab


def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for the word indexed by i.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype=theano.config.floatX)
    W[0] = np.zeros(k, dtype=theano.config.floatX)  # row 0 is reserved for padding
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map


def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from the Google (Mikolov) word2vec binary file.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype(theano.config.floatX).itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                # keep the vector only if the word occurs in our vocabulary
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype=theano.config.floatX)
            else:
                # otherwise skip over the vector bytes
                f.read(binary_len)
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            print word


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except SST.
    Every dataset is lower cased except TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s ", string)
    string = re.sub(r"\'ve", " have ", string)
    string = re.sub(r"n\'t", " not ", string)
    string = re.sub(r"\'re", " are ", string)
    string = re.sub(r"\'d", " would ", string)
    string = re.sub(r"\'ll", " will ", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " \? ", string)
    # string = re.sub(r"[a-zA-Z]{4,}", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()


def clean_str_sst(string):
    """
    Tokenization/string cleaning for the SST dataset.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def get_mairesse_features(file_name):
    feats = {}
    with open(file_name, "rb") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        for line in csvreader:
            feats[line[0]] = [float(f) for f in line[1:]]
    return feats


if __name__ == "__main__":
    w2v_file = sys.argv[1]
    data_folder = sys.argv[2]
    mairesse_file = sys.argv[3]
    print "loading data...",
    revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
    num_words = pd.DataFrame(revs)["num_words"]
    max_l = np.max(num_words)
    print "data loaded!"
    print "number of status: " + str(len(revs))
    print "vocab size: " + str(len(vocab))
    print "max sentence length: " + str(max_l)
    print "loading word2vec vectors...",
    w2v = load_bin_vec(w2v_file, vocab)
    print "word2vec loaded!"
    print "num words in word2vec: " + str(len(w2v))
    add_unknown_words(w2v, vocab)
    W, word_idx_map = get_W(w2v)
    rand_vecs = {}
    add_unknown_words(rand_vecs, vocab)
    W2, _ = get_W(rand_vecs)
    mairesse = get_mairesse_features(mairesse_file)
    cPickle.dump([revs, W, W2, word_idx_map, vocab, mairesse], open("essays_mairesse.p", "wb"))
    print "dataset created!"
The process gets killed after running. Hence, I started debugging to find out the culprit line of code. As usual, the first thought that came to me turned out to be right: the process is getting killed owing to the use of the Google News vector file. The file is not getting loaded fully and it consumes a hell of a lot of memory, which is the matter of concern for me.
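For scale, here is my rough arithmetic on the sizes involved (the ~3M words and 300 dimensions are the published properties of GoogleNews-vectors-negative300.bin, not something my script measures):

import numpy as np

n_words = 3000000      # approximate vocabulary of GoogleNews-vectors-negative300.bin
dims = 300             # vector dimensionality
bytes_per_float = np.dtype('float32').itemsize   # the file stores 32-bit floats

print("full matrix in RAM: %.1f GB" % (n_words * dims * bytes_per_float / 1e9))   # ~3.6 GB
print("per 100k kept words: %.0f MB" % (100000 * dims * bytes_per_float / 1e6))   # ~120 MB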
I am looking for a solution that can make the process load the Google vectors in chunks, one by one, and still give a better training result.
But that is of no use unless the complete data contributes to getting a better resultant trained model.
Kindly suggest how I can get a better result with less, or better optimized, memory.
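For reference, the kind of partial loading I have in mind looks like the sketch below (gensim, the file name and the limit value are my own assumptions, not part of the script above); what I do not know is whether dropping the tail of the vocabulary hurts the final model:

from gensim.models import KeyedVectors

# limit=500000 keeps only the 500k most frequent words out of ~3M,
# cutting the in-memory matrix from ~3.6 GB to roughly 0.6 GB.
w2v = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=500000)

print(w2v['computer'].shape)   # (300,) float32 vector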