-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
56 lines (43 loc) · 1.58 KB
/
preprocess.py
File metadata and controls
56 lines (43 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
import utility as util
class process():
    """Text-cleaning helpers; every method returns a plain ``str``.

    Methods taking ``doc`` iterate over it and read per-token attributes
    (``text``, ``is_punct``, ``is_stop``, ``lemma_``, ``pos_``,
    ``ent_type_``).  NOTE(review): this matches the spaCy ``Doc``/``Token``
    API, which is presumably what callers pass in -- confirm against callers.
    Methods taking ``text`` operate on ordinary strings.
    """

    # Entity labels that ``del_dt`` strips from the token stream.
    _DATE_TIME_LABELS = frozenset({"DATE", "TIME"})

    # Coarse part-of-speech tags kept by ``pos``.
    _ALLOWED_POS = frozenset({"PROPN", "NOUN", "VERB", "ADJ", "ADV"})

    # Compiled once at class creation instead of on every call.
    # NOTE: the scheme group is optional, so any dotted token (for example
    # "example.com", but also "3.14" or "file.txt") is matched and removed.
    _URL_RE = re.compile(
        r"((http|ftp|https):\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])?"
    )
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(self) -> None:
        """Create a processor; the class keeps no per-instance state."""

    def tokenize(self, doc) -> str:
        """Return the text of every token in ``doc``, joined by single spaces."""
        return " ".join(token.text for token in doc)

    def lower(self, doc) -> str:
        """Return the lower-cased token texts of ``doc``, space-joined."""
        return " ".join(token.text.lower() for token in doc)

    def del_punct(self, doc) -> str:
        """Return the token texts of ``doc`` with punctuation tokens removed."""
        return " ".join(token.text for token in doc if not token.is_punct)

    def del_stop(self, doc) -> str:
        """Return the token texts of ``doc`` with stop-word tokens removed."""
        return " ".join(token.text for token in doc if not token.is_stop)

    def del_dt(self, doc) -> str:
        """Return the token texts of ``doc`` with DATE/TIME entities removed.

        Filtering is done per token via ``ent_type_`` so that tokens outside
        any named entity are preserved.  The previous implementation joined
        ``doc.ents`` only, which discarded every non-entity token instead of
        just the date/time ones.
        """
        return " ".join(
            token.text
            for token in doc
            if token.ent_type_ not in self._DATE_TIME_LABELS
        )

    def remove_ws(self, text: str) -> str:
        """Collapse every whitespace run in ``text`` to one space and trim the ends."""
        # ``str.split()`` with no arguments already ignores leading and
        # trailing whitespace, so no separate ``strip()`` is required.
        return " ".join(text.split())

    def remove_url(self, text: str) -> str:
        """Return ``text`` with every URL-like match deleted.

        Surrounding whitespace is left untouched, so removing a URL from the
        middle of a sentence leaves two adjacent spaces behind.
        """
        return self._URL_RE.sub("", text)

    def remove_html(self, text: str) -> str:
        """Return ``text`` with every ``<...>`` tag deleted (tag contents are kept)."""
        return self._HTML_TAG_RE.sub("", text)

    def lemma(self, doc) -> str:
        """Return the whitespace-stripped lemma of every token, space-joined."""
        return " ".join(token.lemma_.strip() for token in doc)

    def pos(self, doc) -> str:
        """Return the texts of tokens tagged PROPN, NOUN, VERB, ADJ or ADV."""
        return " ".join(
            token.text for token in doc if token.pos_ in self._ALLOWED_POS
        )