# NOTE: notebook-export line-number residue removed from this line.
import operator
import re

import pandas as pd
from gensim.models import KeyedVectors
from tqdm import tqdm
# Register tqdm's pandas integration so Series.progress_apply is available below.
tqdm.pandas()
# Load the Quora Insincere Questions competition data (relative Kaggle paths).
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)
def build_vocab(sentences, verbose=True):
    """Count word occurrences across pre-tokenized sentences.

    :param sentences: iterable of lists of words (one list per sentence)
    :param verbose: show a tqdm progress bar when True
    :return: dict mapping each word to its total occurrence count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable=(not verbose)):
        for word in sentence:
            # dict.get instead of try/except KeyError: every first-seen word
            # raised (and caught) an exception, which is the slow path in a
            # loop that runs once per token of the corpus.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab
# Tokenize every question by whitespace and build the raw (uncleaned) vocabulary.
sentences = train["question_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
# Show a small sample of the vocab (first 5 words in insertion order).
print({k: vocab[k] for k in list(vocab)[:5]})
# Load the pretrained GoogleNews word2vec vectors (binary format, ~3.4 GB on disk).
news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)
def check_coverage(vocab, embeddings_index):
    """Measure how much of the vocabulary is covered by the embeddings.

    Prints the fraction of distinct words and the fraction of total token
    occurrences that have an embedding, then returns the out-of-vocabulary
    words sorted by frequency, most frequent first.

    :param vocab: dict mapping word -> occurrence count (assumed non-empty)
    :param embeddings_index: mapping word -> vector; raises KeyError on miss
                             (true for both dict and gensim KeyedVectors)
    :return: list of (word, count) pairs for words without an embedding
    """
    a = {}
    oov = {}
    k = 0  # total occurrences of covered words
    i = 0  # total occurrences of out-of-vocabulary words
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except KeyError:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt,
            # SystemExit, and any unrelated bug inside the try body.
            oov[word] = vocab[word]
            i += vocab[word]
    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(k / (k + i)))
    # Most frequent OOV words first (replaces sorted(...)[::-1]).
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_x
# Baseline coverage before any cleaning.
oov = check_coverage(vocab,embeddings_index)
# The following bare expressions are former notebook cells: they display
# their value in a notebook but have no effect when run as a script.
oov[:10]
'?' in embeddings_index
'&' in embeddings_index
def clean_text(x):
    """Normalize punctuation in a question string.

    "/", "-" and "'" become spaces, "&" is padded with spaces (it has its
    own GoogleNews vector), and all remaining listed punctuation is removed.
    Non-string input is first coerced with str().
    """
    text = str(x)
    # One translation table replicates the original three replace passes;
    # each character maps exactly once, so pass order no longer matters.
    table = {ch: '' for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    table.update({'/': ' ', '-': ' ', "'": ' '})
    table['&'] = ' & '
    return text.translate(str.maketrans(table))
# Re-clean the questions and re-measure coverage.
train["question_text"] = train["question_text"].progress_apply(lambda x: clean_text(x))
# NOTE(review): plain .apply here (no progress bar), unlike the other cells
# which use .progress_apply — results are identical either way.
sentences = train["question_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
# Notebook cell: displays the top-10 remaining OOV words.
oov[:10]
# Peek at the 10 most frequent tokens in the embedding vocabulary
# (shows that GoogleNews contains '##'-style masked-number tokens).
# NOTE(review): `index2entity` was renamed `index_to_key` in gensim 4 —
# confirm the gensim version this kernel runs against.
for i in range(10): print(embeddings_index.index2entity[i])
def clean_numbers(x):
    """Mask digit runs with '#' placeholders, mimicking GoogleNews tokens.

    Runs of 5+ digits become '#####'; shorter runs are masked one '#'
    per digit (4 -> '####', 3 -> '###', 2 -> '##'); single digits stay.
    """
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masking_rules:
        x = re.sub(pattern, mask, x)
    return x
# Mask numbers and re-measure coverage.
train["question_text"] = train["question_text"].progress_apply(lambda x: clean_numbers(x))
sentences = train["question_text"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
# Notebook cell: displays the top-20 remaining OOV words.
oov[:20]
def _get_mispell(mispell_dict): mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys())) return mispell_dict, mispell_re
# British spellings, contractions without apostrophes (the apostrophes were
# stripped by clean_text), and social-media brand names mapped onto words
# that exist in the GoogleNews embedding vocabulary.
mispell_dict = {
    'colour': 'color',
    'centre': 'center',
    'didnt': 'did not',
    'doesnt': 'does not',
    'isnt': 'is not',
    'shouldnt': 'should not',
    'favourite': 'favorite',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'labour': 'labor',
    'organisation': 'organization',
    'wwii': 'world war 2',
    'citicise': 'criticize',
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
}
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Substitute every misspelling matched by the module-level regex
    with its replacement from the module-level `mispellings` dict."""
    return mispellings_re.sub(lambda m: mispellings[m.group(0)], text)
# Apply the misspelling substitutions, then drop a few high-frequency stop
# words that GoogleNews deliberately omits, and re-measure coverage.
train["question_text"] = train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = train["question_text"].progress_apply(lambda x: x.split())
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)
# Notebook cell: displays the top-20 remaining OOV words.
oov[:20]