
I have implemented emotion detection using an LSTM. I first trained my model on a dataset of reviews and their emotions, and then wrote a prediction part that takes a new dataset and predicts an emotion for each review. But the system is giving me wrong results. Can you please show me which part of the code is incorrect and help me correct it so that I get good results? I am posting my code below.

For the implementation, I first trained my model and saved it so that I can load it in my prediction part.

Code for my training part:

```
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')
print(df)

#Preparing data for model training
#Tokenization - since the data is already tokenized and lowercased, we just need to split the words
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

#creating vocabulary(word index)
#Initialize word2id and label2id dictionaries that will be used to encode words and labels
word2id = dict() #creating the dictionary named word2id
label2id = dict() #creating a dictionary named label2id

max_words = 0 #maximum number of words in a sentence

#construction of word2id
for sentence in input_sentences:
    for word in sentence:
        #Add words to word2id if not exist
        if word not in word2id:
            word2id[word] = len(word2id)
    #If length of the sentence is greater than max_words, update max_words
    if len(sentence) > max_words:
        max_words = len(sentence)

#Construction of label2id and id2label dictionaries
label2id = {l: i for i, l in enumerate(set(labels))}
id2label = {v: k for k, v in label2id.items()}
print(label2id)
print(id2label)
print(word2id)
#Encode samples with corresponding integer values
import keras

#Encode input words and labels

X = [[word2id[word] for word in sentence] for sentence in input_sentences]
Y = [label2id[label] for label in labels]

#Apply padding to X
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, max_words)

#Convert Y to a one-hot encoded numpy array
Y = keras.utils.to_categorical(Y, num_classes=len(label2id), dtype='float32')

#Print shapes
print("Shape of X: {}".format(X.shape))
print("Shape of Y: {}".format(Y.shape))

#Build LSTM Model with attention
embedding_dim = 100 # The dimension of word embeddings

#Define input tensor
sequence_input = keras.Input(shape=(max_words,), dtype='int32')

# Word embedding layer
embedded_inputs =keras.layers.Embedding(len(word2id) + 1,
                                        embedding_dim,
                                        input_length=max_words)(sequence_input)

# Apply dropout to prevent overfitting
embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)

# Apply Bidirectional LSTM over embedded inputs
lstm_outs = keras.layers.Bidirectional(
    keras.layers.LSTM(embedding_dim, return_sequences=True)
)(embedded_inputs)

# Apply dropout to LSTM outputs to prevent overfitting
lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)

# Attention Mechanism - Generate attention vectors

attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
attention_vector = keras.layers.Reshape((max_words,))(attention_vector)
attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)
attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

# Last layer: fully connected with softmax activation
fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
output = keras.layers.Dense(len(label2id), activation='softmax')(fc)

# Finally building model
model = keras.Model(inputs=[sequence_input], outputs=output)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')

# Print model summary
model.summary()

# Train the model (only 2 epochs here; more will usually be needed)
model.fit(X, Y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)

# Save in HDF5 format so load_model() can restore it
# (a .py extension is misleading - this is a model file, not a script)
model.save('trainmodelsave.h5')
```
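One detail worth flagging before the prediction part: both scripts rebuild `label2id` from `set(labels)`, and set ordering is not stable across Python runs, so the prediction script can assign different integer ids to the emotions than the ones the model was trained with. A minimal sketch (using the variable names from the training script above) that persists the mappings with `pickle` so the prediction script can reuse them instead of recomputing them:

```
import pickle

# Persist everything the prediction script needs to reproduce
# the exact encoding the model was trained on
with open('vocab.pkl', 'wb') as f:
    pickle.dump({'word2id': word2id,
                 'label2id': label2id,
                 'id2label': id2label,
                 'max_words': max_words}, f)
```

The prediction script would then load these back instead of rebuilding them from the CSV:

```
import pickle

with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
word2id, label2id = vocab['word2id'], vocab['label2id']
id2label, max_words = vocab['id2label'], vocab['max_words']
```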
Then I did this for my prediction part:
```
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text


df = pd.read_csv('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/goodhotelresult.csv')

# Rebuild the same encodings used in training.
# NOTE: rebuilding label2id from set(labels) is not guaranteed to reproduce
# the ids used during training (set order is not stable across Python runs);
# see the note after the training code about persisting these instead.
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()

word2id = dict()
max_words = 0  # maximum number of words in a sentence

for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)
    if len(sentence) > max_words:
        max_words = len(sentence)

label2id = {l: i for i, l in enumerate(set(labels))}
id2label = {v: k for k, v in label2id.items()}

from keras.models import load_model
model = load_model('trainmodelsave.h5')
print(model)

import keras
model_with_attentions = keras.Model(
    inputs=model.input,
    outputs=[model.output, model.get_layer('attention_vec').output])

# Read review data from the JSON file
with open('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/all.json', encoding='utf8') as file_object:
    data = json.load(file_object)

# Keep only the elements that have 'reviews'
new_data = {'selection1': []}
for item in data['selection1']:
    if 'reviews' in item:
        new_data['selection1'].append(item)

# Save the filtered data
with open('output2.json', 'w') as f:
    json.dump(new_data, f)

selection1 = new_data['selection1']

# Empty lists used to build a dataframe later
names = []
dates = []
commentss = []
labels = []
hotelname = []

for item in selection1:
    name = item['name']
    hotelname.append(name)
    for d in item['reviews']:
        names.append(name)
        # convert date from 'January 12, 2020' to 2020-01-12
        date = pd.to_datetime(d['date']).strftime("%Y-%m-%d")
        dates.append(date)
    for com in item['reviews']:
        comment = com['review']
        lcomment = comment.lower()             # convert to lowercase
        result = re.sub(r'\d+', '', lcomment)  # remove numbers
        results = (result.translate(
            str.maketrans('', '', string.punctuation))).strip()  # remove punctuation and whitespace
        comments = remove_stopwords(results)
        commentss.append(comment)
        print('>>>>>>', comments)

        # Keep only the words that are present in the training vocabulary
        encoded_samples = [[word2id[word] for word in comments if word in word2id]]

        # Padding
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)

        # Make predictions
        label_probs, attentions = model_with_attentions.predict(encoded_samples)
        label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(), label_probs[0])}
        labels.append(label_probs)
        print(label_probs)
```

My results are like this:

```
>>>>>> ['wasnt', 'impressed', 'poor', 'choice', 'anything', 'inclusive', 'option', 'snaks', 'day', 'poor', 'choice', 'free', 'drinks', 'mini', 'bar', 'even', 'coffee', 'payable', 'entertainment', 'evening', 'surprising', 'would', 'come', 'back', 'price', 'go', 'maritim', 'much', 'much', 'better']
{'happy': 0.30163398, 'enjoy': 0.12936097, 'sadness': 0.018949889, 'trust': 0.19013356, 'joy': 0.08350239, 'disgust': 0.13000967, 'anger': 0.14640959}

>>>>>> ['bad', 'experience', 'lazy', 'employees', 'specially', 'boat', 'house', 'food', 'horrible', 'took', 'long', 'get', 'food', 'door', 'room', 'blockedoverall', 'service', 'hotel', 'poor']
{'happy': 0.08209001, 'enjoy': 0.26885188, 'sadness': 0.017319722, 'trust': 0.3754914, 'joy': 0.04761887, 'disgust': 0.059040256, 'anger': 0.14958787}

>>>>>> ['hotel', 'nice', 'lack', 'staff', 'lunch', 'indication', 'hotel', 'situated', 'road']
{'happy': 0.6219977, 'enjoy': 0.046003498, 'sadness': 0.0028672628, 'trust': 0.04223141, 'joy': 0.079679504, 'disgust': 0.14226186, 'anger': 0.06495871}

>>>>>> ['impressed', 'service', 'got', 'mari', 'deal', 'quick', 'efficient', 'hotel', 'turned', 'exactly', 'promised', 'lovely', 'time', 'warm', 'welcome', 'great', 'food']
{'happy': 0.91739607, 'enjoy': 0.0040345276, 'sadness': 4.3373333e-05, 'trust': 0.0034020818, 'joy': 0.01539256, 'disgust': 0.052098893, 'anger': 0.007632463}

>>>>>> ['great', 'stay', 'thank', 'azuri', 'team']
```
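For reference, to reduce each of these probability dictionaries to a single predicted emotion, you can take the label with the highest probability; a small sketch using the `label_probs` dict printed above:

```
# Pick the emotion with the highest predicted probability
predicted_emotion = max(label_probs, key=label_probs.get)
print(predicted_emotion)  # 'happy' for the first review above
```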

I am getting wrong results. What do I need to do to correct this? Please help.
