commit b8a7999 (0 parents: initial commit)
Showing 5 changed files with 270 additions and 0 deletions.
@@ -0,0 +1,74 @@
import random
import json
import pickle
import torch

from linear_nn import NeuralNet
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
#############################

# Instantiate stemmer and pick the device for pytorch:
stemmer = PorterStemmer()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load json file with the training data:
json_file_path = "intents.json"
with open(json_file_path, 'r') as j:
    contents = json.load(j)

FILE = "data.pth"
data = torch.load(FILE)

# Define words to ignore during tokenization and stemming:
ignore_words = ['?', '.', '!', ',']

# Read input size, hidden layer size, output size, tags and weights from the checkpoint:
input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
tags = data["tags"]
model_state = data["model_state"]

# Instantiate the pre-trained pytorch model:
model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

# Load the fitted vectorizer used to encode the training sentences:
with open('vectorization_encoder', 'rb') as f:
    vectorization_encoder = pickle.load(f)

# Define bot name:
bot_name = "Yara"

# Start chat:
print("Let's chat! (type 'quit' to exit)")
while True:
    sentence = input("You: ")
    if sentence == "quit":
        break
    # Tokenize and stem the input, then detokenize back into a single string:
    sentence = TreebankWordDetokenizer().detokenize([stemmer.stem(word) for word in
                                                     nltk.word_tokenize(sentence) if word not in ignore_words])
    sentence = [sentence]  # the vectorizer expects a list of sentences
    X = vectorization_encoder.transform(sentence)  # apply the pre-fitted vectorization
    X = torch.from_numpy(X.toarray()).to(device)  # from numpy to torch

    output = model(X.float())  # make a prediction
    _, predicted = torch.max(output, dim=1)  # class with the highest logit

    tag = tags[predicted.item()]  # map the class index back to its tag

    probs = torch.softmax(output, dim=1)  # turn the logits into probabilities
    prob = probs[0][predicted.item()]

    # If the predicted probability is high enough (i.e. the bot is confident enough),
    # return an answer; otherwise ask the user to rephrase:
    if prob.item() > 0.6:
        for intent in contents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I do not understand, could you rephrase your question, please? If we cannot help you here, please visit our FAQ page at -insert_your_link_here-")
@@ -0,0 +1,16 @@
from torch.utils.data import Dataset, DataLoader


class ChatDataset(Dataset):

    def __init__(self, X_train, y_train):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train

    # support indexing such that dataset[i] can be used to get the i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples
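As a quick illustration of how ChatDataset is meant to be consumed (dummy data with hypothetical sizes; the real arrays come from the training script below):

# Illustrative usage of ChatDataset with a DataLoader (dummy data, hypothetical sizes).
import numpy as np
from torch.utils.data import DataLoader

X_dummy = np.random.rand(16, 10).astype('float32')  # 16 samples, 10 bag-of-words features
y_dummy = np.random.randint(0, 3, size=16)          # 3 hypothetical classes

dataset = ChatDataset(X_dummy, y_dummy)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
for xb, yb in loader:
    print(xb.shape, yb.shape)  # torch.Size([8, 10]) torch.Size([8])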
@@ -0,0 +1,45 @@
{"intents": [ | ||
{"tag": "greeting", | ||
"patterns": ["Hi","Hello", "How are you", "Is anyone there?", "Hello", "Good day"], | ||
"responses": ["Hello", "Hi", "Hi there, how can I help?", "Hi, what can I do for you?"], | ||
"context_set": "" | ||
}, | ||
{"tag": "goodbye", | ||
"patterns": ["Bye", "See you later", "Goodbye", "Ciao"], | ||
"responses": ["See you later, thanks for visiting", "Have a nice day", "Bye! Come back again soon."] | ||
}, | ||
{"tag": "thanks", | ||
"patterns": ["Thanks", "Thank you", "That's helpful"], | ||
"responses": ["Happy to help!", "Any time!", "My pleasure"] | ||
}, | ||
{"tag": "payments", | ||
"patterns": ["Do you take credit cards?", "Do you accept Mastercard?", "Are you cash only?", | ||
"Do you take Debit card?", "Can I pay with Paypal?"], | ||
"responses": ["We accept VISA, Mastercard, AMEX and also Paypal."] | ||
}, | ||
{"tag": "covid", | ||
"patterns": ["Do we need to wear a mask?", "What are your current Covid rules?", "What are your corona restrictions?", "Are all your guest vaccinated"], | ||
"responses": ["We follow the country-wide Covid rules. You don't need to wear a face mask at the moment, but this might change during winter times."] | ||
}, | ||
{"tag": "check-in", | ||
"patterns": ["When is check-in?", "At what time are the rooms ready?", "When can we arrive?", "Is an early check-in possible?"], | ||
"responses": ["Check-in is possible from 12am on. You can arrive earlier and leave your luggage at the front desk."] | ||
}, | ||
{"tag": "check-out", | ||
"patterns": ["When is check-out?", "At what time do we need to leave?", "Is a late check-out possible?"], | ||
"responses": ["Check-out is at 10am latest. If you are interested in late check-out, please type 'late check-out' next."] | ||
}, | ||
{"tag": "late check-out", | ||
"patterns": ["Late check-out"], | ||
"responses": ["Late check-out is an option, but only if the room is not booked for the current day. We charge 10 Dollar for every additional hour."] | ||
}, | ||
{"tag": "breakfast", | ||
"patterns": ["What types of breakfast di you offer?", "What is for breakfast?", "Do you have vegan options for breakfast?", "Are there any vegetarian options for breakfast?", "Do you have fresh fruits for breakfast?"], | ||
"responses": ["We offer a continental breakfast with multiple vegetarian and vegan options. Additionally we serve fresh fruits and juices every morning."] | ||
}, | ||
{"tag": "parking", | ||
"patterns": ["Where can we park our car?", "Do you have a parking lot?", "Where do I park my car?","Do you charge for parking?", "Is parking free?"], | ||
"responses": ["We have a big parking lot with enough space and as our guest you can park for free, of course."] | ||
} | ||
] | ||
} |
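A small, hypothetical sanity check one could run after editing intents.json (not part of the commit); it assumes only the schema visible above:

# Hypothetical sanity check for intents.json: every intent needs a tag,
# at least one pattern and at least one response.
import json

with open("intents.json") as f:
    intents = json.load(f)["intents"]

for intent in intents:
    assert intent["tag"], "intent without a tag"
    assert intent["patterns"], f"no patterns for tag {intent['tag']}"
    assert intent["responses"], f"no responses for tag {intent['tag']}"
print(f"{len(intents)} intents OK:", [i["tag"] for i in intents])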
@@ -0,0 +1,19 @@
import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        # no activation and no softmax at the end (CrossEntropyLoss applies log-softmax itself)
        return out
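A quick shape check of the forward pass; the sizes here are hypothetical, since in training they come from the data:

# Dummy forward pass to illustrate the expected shapes (sizes are hypothetical).
import torch

net = NeuralNet(input_size=10, hidden_size=8, num_classes=3)
x = torch.rand(4, 10)  # batch of 4 bag-of-words vectors
logits = net(x)
print(logits.shape)    # torch.Size([4, 3]) -- raw logits; softmax is applied later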
@@ -0,0 +1,116 @@
import numpy as np
import nltk
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from torch.utils.data import Dataset, DataLoader
import json
import pickle

import data_set
import linear_nn

# nltk.download('punkt')  # uncomment on the first run to fetch the tokenizer data
from nltk.stem.porter import PorterStemmer
##########################


def detokenize_fct(x):
    '''
    Applies detokenization to obtain full sentences.
    '''
    return TreebankWordDetokenizer().detokenize(x)


# Instantiate stemmer and count-vectorizer:
stemmer = PorterStemmer()
vectorizer = CountVectorizer()

# Load json file with the training data:
json_file_path = "intents.json"
with open(json_file_path, 'r') as j:
    contents = json.load(j)


# Training begins here.
# Create empty lists to collect the tags and the preprocessed sentences:
tags = []
sentences = []

# Loop through each pattern sentence in the training data:
for intent in contents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # tokenize and stem each word, then save the detokenized sentence
        sentences.append(detokenize_fct([stemmer.stem(word) for word in nltk.word_tokenize(pattern)]))
        tags.append(tag)

# The detokenization is needed because the vectorizer is applied to one list of full sentences:
X_train = vectorizer.fit_transform(sentences)

# Create the unique tags and numeric class labels:
tags_unique = sorted(set(tags))
y_train = [tags_unique.index(i) for i in tags]

# Save the fitted vectorizer so the chat script can reuse the same encoding:
with open('vectorization_encoder', 'wb') as f:
    pickle.dump(vectorizer, f)

# Train model:
X_train = X_train.toarray()
y_train = np.array(y_train)

# Hyper-parameters for the pytorch NeuralNet:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 500
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 8
output_size = len(tags_unique)  # one class per unique tag (len(tags) would count duplicate patterns)

# Create the dataset to be consumed by the model and instantiate the NeuralNet:
dataset = data_set.ChatDataset(X_train, y_train)
model = linear_nn.NeuralNet(input_size, hidden_size, output_size).to(device)

train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(words.float())
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f'final loss: {loss.item():.4f}')

# Save the model:
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "tags": tags_unique
}

FILE = "data.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')
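To verify that the checkpoint round-trips, one can mirror what the chat script does at startup; a minimal sketch:

# Quick round-trip check on the saved checkpoint (mirrors the chat script's loading code).
import torch
import linear_nn

ckpt = torch.load("data.pth")
net = linear_nn.NeuralNet(ckpt["input_size"], ckpt["hidden_size"], ckpt["output_size"])
net.load_state_dict(ckpt["model_state"])
net.eval()
print("loaded model for tags:", ckpt["tags"])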