I've made simple prediction model with keras and bag of words based on the code which I found in the tutorials. Loading dataset and training finished without problem and accuracy is around 88%.
Dataset has two columns text and tag (i.e. "some text, a"). How can I test trained model with some other data which is not in dataset like model.predict(some text)?
This is sample dataset: tekst,tag Sconto,n Trg Vinodolskog zakona 5,a I wish to save the model so I don't have to train it every time I run the script. Is correct way to put at the end of the script "model.save('my_model.h5')"? How can I the load the model and make prediction with data that it's not in dataset?
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
df = pd.read_csv('dataset3.csv')
df = df[pd.notnull(df['tag'])]
df.head(10)
def print_plot(index):
example = df[df.index == index][['tekst', 'tag']].values[0]
if len(example) > 0:
print(example[0])
print('Tag:', example[1])
print_plot(0)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
"""
text: a string
return: modified initial string
"""
text = BeautifulSoup(text, "lxml").text # HTML decoding
text = text.lower() # lowercase text
text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
return text
df['tekst'] = df['tekst'].apply(clean_text)
print_plot(0)
import itertools
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
train_size = int(len(df) * .7)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))
train_posts = df['tekst'][:train_size]
train_tags = df['tag'][:train_size]
test_posts = df['tekst'][train_size:]
test_tags = df['tag'][train_size:]
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
batch_size = 32
epochs = 2
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_split=0.1)
score = model.evaluate(x_test, y_test,
batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])