Skip to content

Commit

Permalink
add all relevant files
Browse files Browse the repository at this point in the history
  • Loading branch information
janrth committed Dec 16, 2022
0 parents commit b8a7999
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 0 deletions.
74 changes: 74 additions & 0 deletions chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import random
import json
import pickle
import torch

from linear_nn import NeuralNet
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
#############################

# Interactive chat loop: loads the trained intent classifier and the fitted
# vectorizer from disk, then answers user input until the user types 'quit'.

# Activate the stemmer and pick the device for PyTorch:
stemmer = PorterStemmer()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the JSON file with the intents (used below to look up the responses):
json_file_path = "intents.json"
with open(json_file_path, 'r') as j:
    contents = json.load(j)

# Load the saved checkpoint. map_location remaps the stored tensors onto the
# device selected above, so a checkpoint saved on a GPU machine can still be
# loaded on a CPU-only machine.
FILE = "data.pth"
data = torch.load(FILE, map_location=device)

# Define tokens to ignore when tokenizing and stemming:
ignore_words = ['?', '.', '!', ',']

# Read layer sizes, tags and trained weights from the checkpoint:
input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
tags = data['tags']
model_state = data["model_state"]

# Instantiate the pre-trained PyTorch model and switch it to inference mode:
model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

# Load the fitted vectorizer used for encoding the input.
# NOTE: unpickling executes code stored in the file, so only load an encoder
# file that you created yourself.
with open('vectorization_encoder', 'rb') as encoder_file:
    vectorization_encoder = pickle.load(encoder_file)

# Define bot name:
bot_name = "Yara"

# The detokenizer is the same for every input, so build it once:
detokenizer = TreebankWordDetokenizer()

# Start chat:
print("Let's chat! (type 'quit' to exit)")
while True:
    sentence = input("You: ")
    if sentence == "quit":
        break

    # Tokenize and stem the input (dropping the ignored punctuation), then
    # detokenize again because the vectorizer works on whole sentences.
    stemmed_words = [stemmer.stem(word) for word in nltk.word_tokenize(sentence)
                     if word not in ignore_words]
    sentence = [detokenizer.detokenize(stemmed_words)]  # vectorizer expects a list
    X = vectorization_encoder.transform(sentence)  # apply pre-fitted vectorization
    X = torch.from_numpy(X.toarray()).to(device)  # from numpy to torch

    # Pure inference: no gradients are needed, so do not track them.
    with torch.no_grad():
        output = model(X.float())  # raw class scores (logits)
        probs = torch.softmax(output, dim=1)  # turn scores into probabilities
    _, predicted = torch.max(output, dim=1)  # index of the highest-scoring class

    tag = tags[predicted.item()]  # map class index back to its tag
    prob = probs[0][predicted.item()]  # probability of the predicted class

    # If the probability is big enough (so the bot is confident enough), then
    # return an answer, otherwise ask the user to rephrase:
    if prob.item() > 0.6:
        for intent in contents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I do not understand, could you rephrase your answer, please. If we can not help you here, please visit our FAQ page at -insert_your_link_here-")
16 changes: 16 additions & 0 deletions data_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    """Dataset wrapper that pairs each training input with its label.

    X_train and y_train are stored as given; both are indexed with the same
    index, and the dataset length is taken from X_train.
    """

    def __init__(self, X_train, y_train):
        sample_count = len(X_train)
        self.n_samples = sample_count
        self.x_data, self.y_data = X_train, y_train

    def __len__(self):
        """Return the number of samples, so len(dataset) works."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the (input, label) pair at position index."""
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label
45 changes: 45 additions & 0 deletions intents.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{"intents": [
{"tag": "greeting",
"patterns": ["Hi","Hello", "How are you", "Is anyone there?", "Hello", "Good day"],
"responses": ["Hello", "Hi", "Hi there, how can I help?", "Hi, what can I do for you?"],
"context_set": ""
},
{"tag": "goodbye",
"patterns": ["Bye", "See you later", "Goodbye", "Ciao"],
"responses": ["See you later, thanks for visiting", "Have a nice day", "Bye! Come back again soon."]
},
{"tag": "thanks",
"patterns": ["Thanks", "Thank you", "That's helpful"],
"responses": ["Happy to help!", "Any time!", "My pleasure"]
},
{"tag": "payments",
"patterns": ["Do you take credit cards?", "Do you accept Mastercard?", "Are you cash only?",
"Do you take Debit card?", "Can I pay with Paypal?"],
"responses": ["We accept VISA, Mastercard, AMEX and also Paypal."]
},
{"tag": "covid",
"patterns": ["Do we need to wear a mask?", "What are your current Covid rules?", "What are your corona restrictions?", "Are all your guest vaccinated"],
"responses": ["We follow the country-wide Covid rules. You don't need to wear a face mask at the moment, but this might change during winter times."]
},
{"tag": "check-in",
"patterns": ["When is check-in?", "At what time are the rooms ready?", "When can we arrive?", "Is an early check-in possible?"],
"responses": ["Check-in is possible from 12am on. You can arrive earlier and leave your luggage at the front desk."]
},
{"tag": "check-out",
"patterns": ["When is check-out?", "At what time do we need to leave?", "Is a late check-out possible?"],
"responses": ["Check-out is at 10am latest. If you are interested in late check-out, please type 'late check-out' next."]
},
{"tag": "late check-out",
"patterns": ["Late check-out"],
"responses": ["Late check-out is an option, but only if the room is not booked for the current day. We charge 10 Dollar for every additional hour."]
},
{"tag": "breakfast",
"patterns": ["What types of breakfast di you offer?", "What is for breakfast?", "Do you have vegan options for breakfast?", "Are there any vegetarian options for breakfast?", "Do you have fresh fruits for breakfast?"],
"responses": ["We offer a continental breakfast with multiple vegetarian and vegan options. Additionally we serve fresh fruits and juices every morning."]
},
{"tag": "parking",
"patterns": ["Where can we park our car?", "Do you have a parking lot?", "Where do I park my car?","Do you charge for parking?", "Is parking free?"],
"responses": ["We have a big parking lot with enough space and as our guest you can park for free, of course."]
}
]
}
19 changes: 19 additions & 0 deletions linear_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import torch
import torch.nn as nn

class NeuralNet(nn.Module):
    """Feed-forward network: three linear layers with ReLU between them.

    The forward pass returns the raw output of the last linear layer; no
    activation or softmax is applied at the end.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Layer names are part of the saved state_dict keys; keep them stable.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run x through l1 -> ReLU -> l2 -> ReLU -> l3 and return the result."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)
116 changes: 116 additions & 0 deletions train_chatbot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import numpy as np
import nltk
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from torch.utils.data import Dataset, DataLoader
import json
import pickle

import data_set
import linear_nn

#nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
##########################

def detokenize_fct(x):
    '''
    Join a list of tokens back into one full sentence string
    using the Treebank detokenizer.
    '''
    detokenizer = TreebankWordDetokenizer()
    sentence = detokenizer.detokenize(x)
    return sentence

# Training script: builds bag-of-words features from the intent patterns,
# trains the NeuralNet classifier on them and saves both the fitted
# vectorizer and the trained model to disk.

# Instantiate stemmer and count-vectorizer:
stemmer = PorterStemmer()
vectorizer = CountVectorizer()

# load json file with training data:
json_file_path = "intents.json"
with open(json_file_path, 'r') as j:
    contents = json.load(j)


# Training begins here
# Create empty lists to collect the tag and the stemmed sentence of every
# training pattern:
tags = []
sentences = []

# loop through each sentence in training data:
for intent in contents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # tokenization and stemming applied to each word, before detokenized sentence is saved
        sentences.append(detokenize_fct([stemmer.stem(word) for word in nltk.word_tokenize(pattern)]))
        tags.append(tag)

# The detokenization is needed as the vectorizer is applied on one list of many sentences:
X_train = vectorizer.fit_transform(sentences)

# Create unique tags and numeric class labels:
tags_unique = sorted(set(tags))
y_train = [tags_unique.index(i) for i in tags]

# save vectorizer (the with-block makes sure the file is flushed and closed):
with open('vectorization_encoder', 'wb') as encoder_file:
    pickle.dump(vectorizer, encoder_file)

# Train model:
X_train = X_train.toarray()
y_train = np.array(y_train)

# Hyper-parameters for pytorch NeuralNet:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 500
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 8
# One output per distinct class. `tags` holds one entry per training
# sentence, so its length would create output units no label ever targets.
output_size = len(tags_unique)

# Create dataset to be consumed by the model + instantiate the NeuralNetwork:
dataset = data_set.ChatDataset(X_train, y_train)
model = linear_nn.NeuralNet(input_size, hidden_size, output_size).to(device)

# num_workers=0 loads the data in the main process. Worker processes would
# re-import this unguarded script on platforms that spawn subprocesses.
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        # CrossEntropyLoss needs int64 class indices; NumPy's default integer
        # type is platform dependent, so cast explicitly.
        labels = labels.to(device, dtype=torch.long)

        # Forward pass
        outputs = model(words.float())
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f'final loss: {loss.item():.4f}')

# Save the model together with everything needed to rebuild it for inference:
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "tags": tags_unique
}

FILE = "data.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

0 comments on commit b8a7999

Please sign in to comment.