# process_column.py
# Text-normalization helpers: tokenize, case-fold and stem strings and
# columns of strings.
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
class TextProcessor:
    """Normalize text by tokenizing, lowercasing and stemming it.

    A token is a maximal run of characters that are not a space, period,
    underscore or dash. Any other character (for example ``,`` or ``#``)
    is not a separator and stays inside the token it appears in.
    """

    def __init__(self):
        """Create the stemmer and the separator-based tokenizer."""
        self.stemmer = PorterStemmer()
        # Tokenizer to split by space, period, underscore, and dash
        self.tokenizer = RegexpTokenizer(r'[^ \._\-]+')

    def tokenize(self, text):
        """Split ``text`` into tokens.

        Args:
            text: The string to split.

        Returns:
            The list of tokens found by ``self.tokenizer``, in order.
        """
        return self.tokenizer.tokenize(text)

    def case_fold(self, tokens):
        """Lowercase every token.

        Args:
            tokens: Iterable of strings.

        Returns:
            A new list with ``str.lower()`` applied to each token.
        """
        return [token.lower() for token in tokens]

    def stem(self, tokens):
        """Stem every token with ``self.stemmer``.

        Args:
            tokens: Iterable of strings.

        Returns:
            A new list holding the stem of each token, in order.
        """
        return [self.stemmer.stem(token) for token in tokens]

    def process(self, list_):
        """Normalize each string of a list.

        Every item is run through ``processString`` and its resulting
        tokens are joined back into one string with single spaces.

        Args:
            list_: Iterable of strings.

        Returns:
            A list of normalized strings, one per input item, in order.
        """
        return [' '.join(self.processString(item)) for item in list_]

    def processString(self, text):
        """Tokenize, case-fold and stem a single string.

        Args:
            text: The string to normalize.

        Returns:
            The list of normalized tokens.
        """
        tokens = self.tokenize(text)
        tokens = self.case_fold(tokens)
        return self.stem(tokens)

    def processColumns(self, columns):
        """Apply ``process`` to every column.

        Args:
            columns: Iterable of columns, each an iterable of strings.

        Returns:
            A list with one list of normalized strings per column.
        """
        return [self.process(column) for column in columns]

    def columnsToBagOfTokens(self, columns):
        """Collect the distinct whitespace-separated tokens of each column.

        Args:
            columns: Iterable of columns, each an iterable of strings.

        Returns:
            A list with one ``set`` of tokens per column; an empty column
            yields an empty set.
        """
        result = []
        for col in columns:
            tokens = set()
            for xgrams in col:
                # ``update`` accepts any iterable, so the list returned by
                # ``split`` can be passed directly.
                tokens.update(xgrams.split())
            result.append(tokens)
        return result
# Example usage: normalize one sample string and print its tokens.
if __name__ == "__main__":
    text = "IT- Hardware Purchases"
    processor = TextProcessor()
    processed_text = processor.processString(text)
    print(processed_text)