-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocess.py
More file actions
161 lines (114 loc) · 4.53 KB
/
Preprocess.py
File metadata and controls
161 lines (114 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import pandas as pd
from hazm import *
from DictList import *
# from __future__ import unicode_literals
import DictList
# Path to the main 7k-document news corpus (Excel workbook).
MAIN_DF_FILE_NAME = "./IR1_7k_news.xlsx"
# Shared hazm text-processing tools, reused by every function below.
normalizer = Normalizer()
stemmer = Stemmer()
def read_data_from_file(file_name=MAIN_DF_FILE_NAME):
    """Load columns A, C and D of 'Sheet1' from the given Excel file.

    :param file_name: path to the .xlsx corpus (defaults to the main corpus)
    :return: pandas DataFrame holding the three selected columns
    """
    return pd.read_excel(file_name, sheet_name='Sheet1', usecols="A,C,D")
# NOTE(review): reads the whole corpus at import time as a module side effect;
# this frame is never used in this file — confirm no other module imports
# `df_before_preprocess` before removing it.
df_before_preprocess = read_data_from_file(MAIN_DF_FILE_NAME)
def save_data_frame_to_file(data_frame):
    """Write *data_frame* to ./training_data.xlsx (overwritten if present)."""
    data_frame.to_excel("training_data.xlsx")
def normalize(sentence):
    """Return *sentence* run through the shared hazm Normalizer."""
    return normalizer.normalize(sentence)
def tokenize(sentence, type):
    """Tokenize *sentence* with hazm's word_tokenize.

    :param sentence: input text
    :param type: "positional" -> DictList mapping each token to the indexes
                 where it occurs; any other value -> plain list of tokens
    :return: DictList or list, depending on *type*
    """
    tokens = word_tokenize(sentence)
    if type != "positional":  # non positional: plain token list
        return tokens
    # positional: record every occurrence index of each token
    positions = DictList.DictList()
    for index, token in enumerate(tokens):
        positions[token] = index
    return positions
def stem(words_dict, type):
    """Replace every token with its hazm stem, keeping the input's shape.

    :param type: "positional" keeps the token -> indexes structure,
                 any other value yields a plain list
    :param words_dict: DictList (token -> indexes) or plain token list
    :return: DictList or list of stems, matching the input shape
    """
    if type != "positional":  # non positional: simple list of stems
        return [stemmer.stem(token) for token in words_dict]
    stemmed = DictList.DictList()
    for token, indexes in words_dict.items():
        root = stemmer.stem(token)
        for index in indexes:
            stemmed[root] = index
    return stemmed
def remove_stop_words(words_dict, type):
    """Remove hazm stop words from *words_dict* in place and return it.

    :param type: "positional" -> DictList keyed by token; otherwise a list
    :param words_dict: token collection produced by tokenize()/stem()
    :return: the same object, with stop-word entries removed

    Fix: the non-positional branch previously used list.remove(), which
    deletes only the FIRST occurrence of each stop word, so a stop word
    appearing several times in a document survived in the token list.
    All occurrences are now filtered out.
    """
    stop_words = set(stopwords_list())
    if type == "positional":
        # delete the whole token -> indexes entry for each stop word present
        for word in stop_words.intersection(words_dict):
            del words_dict[word]
    else:  # non positional: drop every occurrence, mutating in place
        words_dict[:] = [token for token in words_dict if token not in stop_words]
    return words_dict
def preprocess(data_frame, type, remove_stop_words_flag=False, stem_flag=False):
    """Run the full preprocessing pipeline over the 'content' column.

    :param data_frame: DataFrame with a text column named 'content'
    :param type: "positional" or any other value — see tokenize()
    :param remove_stop_words_flag: drop hazm stop words when True
    :param stem_flag: stem the tokens when True
    :return: a NEW DataFrame with normalized 'content' and a 'tokens' column

    Fix: the original aliased the input (`new_data_frame = data_frame`) and
    therefore mutated the caller's DataFrame in place; the pipeline now
    operates on a copy so the argument is left untouched.
    """
    new_data_frame = data_frame.copy()
    # step1: Normalize
    new_data_frame['content'] = new_data_frame['content'].apply(normalize)
    # step2: Tokenization
    new_data_frame['tokens'] = new_data_frame['content'].apply(tokenize, type=type)
    # step3: Stemming (optional)
    if stem_flag:
        new_data_frame['tokens'] = new_data_frame['tokens'].apply(stem, type=type)
    # step4: Stop-word removal (optional)
    if remove_stop_words_flag:
        new_data_frame['tokens'] = new_data_frame['tokens'].apply(remove_stop_words, type=type)
    return new_data_frame
def preprocess_query(query, type, remove_stop_words_flag=False, stem_flag=False):
    """Apply the same pipeline as preprocess() to a single query string.

    :param query: raw query text
    :param type: "positional" or any other value — see tokenize()
    :param remove_stop_words_flag: drop hazm stop words when True
    :param stem_flag: stem the tokens when True
    :return: token collection (DictList or list, depending on *type*)
    """
    # step1 + step2: normalize, then tokenize
    tokens = tokenize(normalize(query), type)
    # step3: optional stemming
    if stem_flag:
        tokens = stem(tokens, type)
    # step4: optional stop-word removal
    if remove_stop_words_flag:
        tokens = remove_stop_words(tokens, type)
    return tokens
def preprocess_word(word):
    """Normalize then stem a single word with the shared hazm tools."""
    return stemmer.stem(normalizer.normalize(word))
def get_data_frame_after_preprocess(type, remove_stop_words_flag, stem_flag):
    """Reload the main corpus and run preprocess() with the given options."""
    corpus = read_data_from_file(MAIN_DF_FILE_NAME)
    return preprocess(corpus, type, remove_stop_words_flag, stem_flag)
if __name__ == "__main__":
    print()
    corpus_df = read_data_from_file()
    preprocessed_df = preprocess(corpus_df, "non positional", True, True)
    # Optional exports, currently disabled:
    # save_data_frame_to_file(preprocessed_df)
    # tokens_per_doc = [row["tokens"] for _, row in preprocessed_df.iterrows()]
    # with open("training_data.json", 'w', encoding='utf-8') as fp:
    #     json.dump(tokens_per_doc, fp, sort_keys=True, indent=4, ensure_ascii=False)