-
Notifications
You must be signed in to change notification settings - Fork 94
/
twitter_preprocessing_recipe.py
85 lines (67 loc) · 2.57 KB
/
twitter_preprocessing_recipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Preprocess the tweets by normalising username, removing unnecessary punctuations, expanding the hashtags"""
import re
import os
import pandas as pd
import datatable as dt
from h2oaicore.systemutils import config
from h2oaicore.data import CustomData
# Input columns whose text is preprocessed; each one gets a companion
# "preprocessed_<name>" column in the output.
text_colnames = ["text"]
# Base name (without the ".csv" extension) of the dataset file written to disk.
output_dataset_name = "df_preprocessed"
# NOTE(review): presumably read by the hosting recipe framework so that these
# modules are installed/importable before the recipe runs — confirm against
# the framework documentation.
_global_modules_needed_by_name = ["wordsegment"]
import wordsegment
class PreprocessDataClass(CustomData):
    """Data recipe that adds cleaned-up copies of the tweet text columns."""

    @staticmethod
    def create_data(X: dt.Frame = None):
        """Preprocess the configured text columns of ``X`` and save the result as CSV.

        For every name in ``text_colnames`` a new column called
        ``"preprocessed_" + name`` is added, holding the output of
        ``process_tweets.preprocess`` applied to the stringified values.

        :param X: input frame; when it is ``None`` nothing is written.
        :return: an empty list when ``X`` is ``None``, otherwise a one-element
            list with the path of the CSV file that was written.
        """
        if X is None:
            return []

        tweet_cleaner = process_tweets()
        frame = dt.Frame(X).to_pandas()

        for column in text_colnames:
            # Cast to str first so non-string cells (e.g. NaN) can be cleaned too.
            as_text = frame[column].astype(str)
            frame["preprocessed_" + column] = as_text.apply(tweet_cleaner.preprocess)

        # The output lives in the contrib directory below the configured data directory.
        out_dir = os.path.join(config.data_directory, config.contrib_relative_directory)
        os.makedirs(out_dir, exist_ok=True)

        csv_path = os.path.join(out_dir, output_dataset_name + ".csv")
        frame.to_csv(csv_path, index=False)
        return [csv_path]
class process_tweets:
"""Class for Processing tweets"""
def __init__(self):
wordsegment.load()
self.segment = wordsegment.segment
@staticmethod
def currency_replace(text):
text = re.sub(r"\$", " dollar ", text)
text = re.sub(r"£", " pound ", text)
text = re.sub(r"€", " euro ", text)
text = re.sub(r"¥", " yen ", text)
text = re.sub(r"[¢₡₱₭₦]", " currency ", text)
return text
@staticmethod
def char_removing(text):
text = text.replace("http://url.removed", "")
text = re.sub(r"[ं-ో̇]", "", text)
text = re.sub(r"[•]", "", text)
text = re.sub(r"[】【]", "", text)
text = re.sub(r"[\{\}\(\)\[\]]+", " ", text)
text = re.sub(r"[*/\&|_<>~\+=\-\^™\\\%]+", " ", text)
text = re.sub(r"[;:…]+", " ", text)
return text
def fix_hashtag(self, text):
hashtags = re.findall(r"(#\w+)", text)
for hashtag in hashtags:
processed_hashtag = '# ' + (' '.join(self.segment(hashtag)))
text = text.replace(hashtag, processed_hashtag)
return text
@staticmethod
def fix_username(text):
text = re.sub(r"@[a-zA-Z0-9]+", "@username", text)
return text
def preprocess(self, text):
text = self.currency_replace(text)
text = self.char_removing(text)
text = self.fix_hashtag(text)
text = self.fix_username(text)
text = re.sub(r" +", " ", text)
return text