import re
import string
import random
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

import spacy
from spacy.util import compounding, minibatch

import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, Embedding, BatchNormalization, Dense,
                          TimeDistributed, Dropout, Bidirectional,
                          Flatten, GlobalMaxPool1D)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import ( precision_score, recall_score, f1_score, classification_report, accuracy_score )
  target                                            message
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...
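The message_len column that appears in the next output is presumably a simple word count per message; a hedged one-liner that would produce it:

# Word count per message (assumed definition of message_len)
df['message_len'] = df['message'].apply(lambda x: len(str(x).split(' ')))
df.head()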
  target                                            message  message_len
0    ham  Go until jurong point, crazy.. Available only ...           20
1    ham                      Ok lar... Joking wif u oni...             6
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28
3    ham  U dun say so early hor... U c already then say...           11
4    ham  Nah I don't think he goes to usf, he lives aro...           13

171
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove punctuation and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
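The message_clean column shown in the next output is presumably produced by applying clean_text to the raw messages, roughly:

# Apply the cleaning function to every message (assumed step)
df['message_clean'] = df['message'].apply(clean_text)
df.head()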
  target                                            message  message_len                                      message_clean
0    ham  Go until jurong point, crazy.. Available only ...           20  go until jurong point crazy available only in ...
1    ham                      Ok lar... Joking wif u oni...             6                            ok lar joking wif u oni
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28   free entry in a wkly comp to win fa cup final...
3    ham  U dun say so early hor... U c already then say...           11        u dun say so early hor u c already then say
4    ham  Nah I don't think he goes to usf, he lives aro...           13  nah i dont think he goes to usf he lives aroun...
# Assumed stop word list: NLTK's English stop words (stop_words is not defined earlier in this section)
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()
The output is:
  target                                            message  message_len                                      message_clean
0    ham  Go until jurong point, crazy.. Available only ...           20  go jurong point crazy available bugis n great ...
1    ham                      Ok lar... Joking wif u oni...             6                              ok lar joking wif oni
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28    free entry wkly comp win fa cup final tkts m...
3    ham  U dun say so early hor... U c already then say...           11                      dun say early hor already say
4    ham  Nah I don't think he goes to usf, he lives aro...           13        nah dont think goes usf lives around though
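The next output shows stemmed tokens (e.g. 'joking' becomes 'joke'), so a stemming pass is applied in between; a minimal sketch using NLTK's SnowballStemmer (the exact stemmer is an assumption, and the same stemmer object is reused by preprocess_data below):

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')  # assumed stemmer; a PorterStemmer would behave similarly

def stemm_text(text):
    # Reduce every word to its stem
    return ' '.join(stemmer.stem(word) for word in text.split(' '))

df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()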
  target                                            message  message_len                                      message_clean
0    ham  Go until jurong point, crazy.. Available only ...           20  go jurong point crazi avail bugi n great world...
1    ham                      Ok lar... Joking wif u oni...             6                                ok lar joke wif oni
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28    free entri wkli comp win fa cup final tkts m...
3    ham  U dun say so early hor... U c already then say...           11                      dun say earli hor alreadi say
4    ham  Nah I don't think he goes to usf, he lives aro...           13          nah dont think goe usf live around though
def preprocess_data(text):
    # Clean punctuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stem all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text
  target                                            message  message_len                                      message_clean
0    ham  Go until jurong point, crazy.. Available only ...           20  go jurong point crazi avail bugi n great world...
1    ham                      Ok lar... Joking wif u oni...             6                                ok lar joke wif oni
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28    free entri wkli comp win fa cup final tkts m...
3    ham  U dun say so early hor... U c already then say...           11                        dun say ear hor alreadi say
4    ham  Nah I don't think he goes to usf, he lives aro...           13          nah dont think goe usf live around though
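The target_encoded column in the next output maps ham/spam to 0/1; a hedged sketch of that encoding step:

from sklearn.preprocessing import LabelEncoder

# Encode 'ham' as 0 and 'spam' as 1 (alphabetical order; the encoder choice is an assumption)
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])
df.head()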
  target                                            message  message_len                                      message_clean  target_encoded
0    ham  Go until jurong point, crazy.. Available only ...           20  go jurong point crazi avail bugi n great world...               0
1    ham                      Ok lar... Joking wif u oni...             6                                ok lar joke wif oni               0
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...           28    free entri wkli comp win fa cup final tkts m...               1
3    ham  U dun say so early hor... U c already then say...           11                        dun say ear hor alreadi say               0
4    ham  Nah I don't think he goes to usf, he lives aro...           13          nah dont think goe usf live around though               0
wc = WordCloud(
    background_color='white',
    max_words=200,
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'ham', 'message_clean']))

plt.figure(figsize=(18, 10))
plt.title('Top words for HAM messages',
          fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
wc = WordCloud(
    background_color='white',
    max_words=200,
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'spam', 'message_clean']))

plt.figure(figsize=(18, 10))
plt.title('Top words for SPAM messages',
          fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# How to define X and y (from the SMS data) for use with CountVectorizer
x = df['message_clean']
y = df['target_encoded']
print(len(x), len(y))
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
# Instantiate the vectorizer and learn the training vocabulary
vect = CountVectorizer()
vect.fit(x_train)
# Use the trained vectorizer to create a document-term matrix from the train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)
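With the document-term matrices in place, the metrics imported at the top suggest a classifier is trained and evaluated next; a minimal baseline sketch with Multinomial Naive Bayes (the specific model is an assumption, not necessarily the one used in the original notebook):

from sklearn.naive_bayes import MultinomialNB

# Fit a simple bag-of-words baseline and evaluate it on the held-out set
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)

y_pred = nb.predict(x_test_dtm)
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))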
ngram_range: an n-gram is a contiguous sequence of n words. For example, the sentence "I am Groot" contains the 2-grams "I am" and "am Groot", and the sentence itself is a 3-gram. The parameter ngram_range=(a, b) sets a as the smallest and b as the largest n-gram size to include in the features; the default ngram_range is (1, 1). In a recent project modelling online job postings, I found that including 2-grams significantly improved the model's predictive power. This is intuitive: many job titles, such as "data scientist", "data engineer" and "data analyst", are two words long.
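For illustration, this is how the n-gram range could be passed to CountVectorizer; comparing vocabulary sizes shows how much the feature space grows when bigrams are added (a sketch, not part of the original pipeline):

# Include unigrams and bigrams in the features
vect_ngram = CountVectorizer(ngram_range=(1, 2))
vect_ngram.fit(x_train)

print('Unigram vocabulary size:', len(vect.vocabulary_))
print('Unigram + bigram vocabulary size:', len(vect_ngram.vocabulary_))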
# Load GloVe 100D embeddings
embeddings_dictionary = dict()

with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
# embeddings_dictionary
# Now we will load embedding vectors of those words that appear in the
# GloVe dictionary. Others will be initialized to 0.
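The loop below assumes a Keras tokenizer has already been fitted and a zero-initialized matrix exists; a minimal sketch of that setup (fitting on the cleaned messages is an assumption):

# Fit a tokenizer and prepare an all-zeros embedding matrix to be filled below
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(df['message_clean'])

vocab_length = len(word_tokenizer.word_index) + 1
embedding_dim = 100  # matches the GloVe 100D vectors loaded above
embedding_matrix = np.zeros((vocab_length, embedding_dim))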
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from transformers import BertTokenizer, TFBertModel
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except:
    strategy = tf.distribute.get_strategy()

print('Number of replicas in sync: ', strategy.num_replicas_in_sync)
   id                                               text  target  text_len
0   1  Our Deeds are the Reason of this #earthquake M...       1        13
1   4             Forest fire near La Ronge Sask. Canada       1         7
2   5  All residents asked to 'shelter in place' are ...       1        22
3   6  13,000 people receive #wildfires evacuation or...       1         9
4   7  Just got sent this photo from Ruby #Alaska as ...       1        17
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)
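clean_text below also calls remove_url and remove_emoji, which are not defined in this section; minimal versions are sketched here (the exact patterns used originally are assumptions):

def remove_url(text):
    # Strip http(s) and www links
    url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url, '', text)

def remove_emoji(text):
    # Strip common emoji code-point ranges
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags
        ']+',
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)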
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove punctuation and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '', text
    )
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    return text
# Test emoji removal
remove_emoji("Omg another Earthquake 😔😔")
def preprocess_data(text):
    # Clean punctuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords and stem all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' ')
                    if word not in stop_words)
    return text
   id                                               text  target  text_len                                         text_clean
0   1  Our Deeds are the Reason of this #earthquake M...       1        13          deed reason earthquak may allah forgiv us
1   4             Forest fire near La Ronge Sask. Canada       1         7               forest fire near la rong sask canada
2   5  All residents asked to 'shelter in place' are ...       1        22  resid ask shelter place notifi offic evacu she...
3   6  13,000 people receive #wildfires evacuation or...       1         9       peopl receiv wildfir evacu order california
4   7  Just got sent this photo from Ruby #Alaska as ...       1        17  got sent photo rubi alaska smoke wildfir pour ...
Word clouds
def create_corpus_df(tweet, target):
    corpus = []
    for x in tweet[tweet['target'] == target]['text_clean'].str.split():
        for i in x:
            corpus.append(i)
    return corpus
corpus_disaster_tweets = create_corpus_df(df, 1)
dic = defaultdict(int)
for word in corpus_disaster_tweets:
    dic[word] += 1

top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]
print(top)
wc = WordCloud(
    background_color='white',
    max_words=200,
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 1, 'text_clean']))

plt.figure(figsize=(12, 6))
plt.title('Top words for Real Disaster tweets',
          fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
# Load GloVe 100D embeddings
# We are not going to do it here as they were loaded earlier.
# Now we will load embedding vectors of those words that appear in the
# GloVe dictionary. Others will be initialized to 0.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    train_target,
    test_size=0.25
)
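Given the Keras layers imported at the top and the GloVe matrix built above, a natural next step is a GloVe-initialized bidirectional LSTM classifier. The sketch below is one plausible architecture, not necessarily the original one, and the hyperparameters are assumptions:

# Bidirectional LSTM on top of frozen GloVe embeddings (sketch)
model = Sequential([
    Embedding(
        vocab_length,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,
    ),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,        # assumption
    batch_size=32,   # assumption
    callbacks=[ReduceLROnPlateau(monitor='val_loss', patience=2)],
)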