-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocess.py
More file actions
161 lines (114 loc) · 4.53 KB
/
Preprocess.py
File metadata and controls
161 lines (114 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import pandas as pd
from hazm import *
from DictList import *
# from __future__ import unicode_literals
import DictList
# Path to the main 7k-document news corpus (Excel workbook).
MAIN_DF_FILE_NAME = "./IR1_7k_news.xlsx"
# Shared hazm text-processing tools, reused by every function below.
normalizer = Normalizer()
stemmer = Stemmer()
def read_data_from_file(file_name=MAIN_DF_FILE_NAME):
    """Load columns A, C and D of 'Sheet1' from the given Excel file.

    :param file_name: path to the .xlsx corpus (defaults to the main corpus)
    :return: pandas DataFrame holding the three selected columns
    """
    return pd.read_excel(file_name, sheet_name='Sheet1', usecols="A,C,D")
# NOTE(review): reads the whole corpus at import time as a module side effect;
# this frame is never used in this file — confirm no other module imports
# `df_before_preprocess` before removing it.
df_before_preprocess = read_data_from_file(MAIN_DF_FILE_NAME)
def save_data_frame_to_file(data_frame):
    """Write *data_frame* to ./training_data.xlsx (overwritten if present)."""
    data_frame.to_excel("training_data.xlsx")
def normalize(sentence):
    """Return *sentence* run through the shared hazm Normalizer."""
    return normalizer.normalize(sentence)
def tokenize(sentence, type):
    """Tokenize *sentence* with hazm's word_tokenize.

    :param sentence: input text
    :param type: "positional" -> DictList mapping each token to the indexes
                 where it occurs; any other value -> plain list of tokens
    :return: DictList or list, depending on *type*
    """
    tokens = word_tokenize(sentence)
    if type != "positional":  # non positional: plain token list
        return tokens
    # positional: record every occurrence index of each token
    positions = DictList.DictList()
    for index, token in enumerate(tokens):
        positions[token] = index
    return positions
def stem(words_dict, type):
    """Replace every token with its hazm stem, keeping the input's shape.

    :param type: "positional" keeps the token -> indexes structure,
                 any other value yields a plain list
    :param words_dict: DictList (token -> indexes) or plain token list
    :return: DictList or list of stems, matching the input shape
    """
    if type != "positional":  # non positional: simple list of stems
        return [stemmer.stem(token) for token in words_dict]
    stemmed = DictList.DictList()
    for token, indexes in words_dict.items():
        root = stemmer.stem(token)
        for index in indexes:
            stemmed[root] = index
    return stemmed
def remove_stop_words(words_dict, type):
    """Remove hazm stop words from *words_dict* in place and return it.

    :param type: "positional" -> DictList keyed by token; otherwise a list
    :param words_dict: token collection produced by tokenize()/stem()
    :return: the same object, with stop-word entries removed

    Fix: the non-positional branch previously used list.remove(), which
    deletes only the FIRST occurrence of each stop word, so a stop word
    appearing several times in a document survived in the token list.
    All occurrences are now filtered out.
    """
    stop_words = set(stopwords_list())
    if type == "positional":
        # delete the whole token -> indexes entry for each stop word present
        for word in stop_words.intersection(words_dict):
            del words_dict[word]
    else:  # non positional: drop every occurrence, mutating in place
        words_dict[:] = [token for token in words_dict if token not in stop_words]
    return words_dict
def preprocess(data_frame, type, remove_stop_words_flag=False, stem_flag=False):
    """Run the full preprocessing pipeline over the 'content' column.

    :param data_frame: DataFrame with a text column named 'content'
    :param type: "positional" or any other value — see tokenize()
    :param remove_stop_words_flag: drop hazm stop words when True
    :param stem_flag: stem the tokens when True
    :return: a NEW DataFrame with normalized 'content' and a 'tokens' column

    Fix: the original aliased the input (`new_data_frame = data_frame`) and
    therefore mutated the caller's DataFrame in place; the pipeline now
    operates on a copy so the argument is left untouched.
    """
    new_data_frame = data_frame.copy()
    # step1: Normalize
    new_data_frame['content'] = new_data_frame['content'].apply(normalize)
    # step2: Tokenization
    new_data_frame['tokens'] = new_data_frame['content'].apply(tokenize, type=type)
    # step3: Stemming (optional)
    if stem_flag:
        new_data_frame['tokens'] = new_data_frame['tokens'].apply(stem, type=type)
    # step4: Stop-word removal (optional)
    if remove_stop_words_flag:
        new_data_frame['tokens'] = new_data_frame['tokens'].apply(remove_stop_words, type=type)
    return new_data_frame
def preprocess_query(query, type, remove_stop_words_flag=False, stem_flag=False):
    """Apply the same pipeline as preprocess() to a single query string.

    :param query: raw query text
    :param type: "positional" or any other value — see tokenize()
    :param remove_stop_words_flag: drop hazm stop words when True
    :param stem_flag: stem the tokens when True
    :return: token collection (DictList or list, depending on *type*)
    """
    # step1 + step2: normalize, then tokenize
    tokens = tokenize(normalize(query), type)
    # step3: optional stemming
    if stem_flag:
        tokens = stem(tokens, type)
    # step4: optional stop-word removal
    if remove_stop_words_flag:
        tokens = remove_stop_words(tokens, type)
    return tokens
def preprocess_word(word):
    """Normalize then stem a single word with the shared hazm tools."""
    return stemmer.stem(normalizer.normalize(word))
def get_data_frame_after_preprocess(type, remove_stop_words_flag, stem_flag):
    """Reload the main corpus and run preprocess() with the given options."""
    corpus = read_data_from_file(MAIN_DF_FILE_NAME)
    return preprocess(corpus, type, remove_stop_words_flag, stem_flag)
if __name__ == "__main__":
    print()
    corpus_df = read_data_from_file()
    preprocessed_df = preprocess(corpus_df, "non positional", True, True)
    # Optional exports, currently disabled:
    # save_data_frame_to_file(preprocessed_df)
    # tokens_per_doc = [row["tokens"] for _, row in preprocessed_df.iterrows()]
    # with open("training_data.json", 'w', encoding='utf-8') as fp:
    #     json.dump(tokens_per_doc, fp, sort_keys=True, indent=4, ensure_ascii=False)