## preprocessing

### logging function

A small decorator that logs each pipeline step: the function name, the shape of the returned frame, and the elapsed wall-clock time.

```python
import datetime as dt
from functools import wraps

import pandas as pd
import pandas.testing as pdt


def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # log function name, shape of the returned frame and elapsed time
        print(f"{func.__name__:20} {str(result.shape):10} {time_taken:20}")
        return result
    return wrapper
```
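Note that the decorator assumes the wrapped function returns something with a `.shape` attribute, so it is meant for DataFrame-in, DataFrame-out steps. A quick sanity check (the `identity` function here is just an illustration, not part of the pipeline):

```python
@log_step
def identity(df: pd.DataFrame) -> pd.DataFrame:
    # returns the frame unchanged; only the log line is of interest
    return df

identity(pd.DataFrame({'body': ['a', 'b']}))
# prints e.g.: identity             (2, 1)     0:00:00.000100
```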
### convert to lowercase

```python
@log_step
def conv_to_lowerc(comments: pd.DataFrame) -> pd.DataFrame:
    # lowercase the comment text in the `body` column
    return comments\
        .assign(body=lambda x: x['body'].str.lower())
```
```python
test_lower = pd.DataFrame(
    columns=['body', 'result'],
    data=[['Test', 'test'],
          ['TESTS', 'tests']]
)

test_lower.pipe(conv_to_lowerc)
```

```
conv_to_lowerc       (2, 2)     0:00:00.002159
```

|   | body | result |
|---|------|--------|
| 0 | test | test |
| 1 | tests | tests |

```python
pdt.assert_series_equal(conv_to_lowerc(test_lower)['body'],
                        test_lower['result'],
                        check_names=False)
```

```
conv_to_lowerc       (2, 2)     0:00:00.000774
```
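One detail worth knowing: pandas `.str` methods propagate missing values instead of raising, so rows with `NaN` in `body` would pass through this step unchanged. A quick illustration (hypothetical data, not part of the test suite):

```python
import numpy as np

# NaN survives .str.lower() untouched
pd.Series(['ABC', np.nan]).str.lower()
# 0    abc
# 1    NaN
# dtype: object
```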
### remove punctuation

```python
test_rm_punct = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        ['No-punctuation!', 'No punctuation ']
    ]
)
```

```python
@log_step
def rm_punct(comments: pd.DataFrame) -> pd.DataFrame:
    # replace every run of non-word, non-whitespace characters with a space
    return comments\
        .assign(body=lambda x: x['body'].str.replace(r'[^\w\s]+', ' ', regex=True))
```
```python
rm_punct(test_rm_punct)
```

```
rm_punct             (1, 2)     0:00:00.000935
```

|   | body | result |
|---|------|--------|
| 0 | No punctuation | No punctuation |

```python
pdt.assert_series_equal(
    rm_punct(test_rm_punct)['body'],
    test_rm_punct['result'],
    check_names=False
)
```

```
rm_punct             (1, 2)     0:00:00.000706
```
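One side effect of `[^\w\s]+` worth keeping in mind: apostrophes are also replaced, so contractions fall apart into separate tokens downstream. A quick check (illustrative input, not part of the test suite):

```python
# "Don't" loses its apostrophe and later tokenizes as ['don', 't']
pd.Series(["Don't stop!"]).str.replace(r'[^\w\s]+', ' ', regex=True)
# 0    Don t stop 
# dtype: object
```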
### tokenize

```python
test_tokenize = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        ['These are three-tokens ', ['These', 'are', 'three-tokens']]
    ]
)
```

```python
@log_step
def tokenize(comments: pd.DataFrame) -> pd.DataFrame:
    # split the comment text on whitespace into a list of tokens
    return comments\
        .assign(body=lambda x: x['body'].str.split())
```
```python
tokenize(test_tokenize)
```

```
tokenize             (1, 2)     0:00:00.000743
```

|   | body | result |
|---|------|--------|
| 0 | [These, are, three-tokens] | [These, are, three-tokens] |

```python
pdt.assert_series_equal(
    tokenize(test_tokenize)['body'],
    test_tokenize['result'],
    check_names=False
)
```

```
tokenize             (1, 2)     0:00:00.000654
```
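Calling `str.split()` with no separator splits on any run of whitespace and ignores leading and trailing whitespace, which is why the trailing space in the test input does not produce an empty token:

```python
# runs of spaces, tabs and newlines all collapse into single splits
pd.Series(['a  b\tc \n']).str.split()
# 0    [a, b, c]
# dtype: object
```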
### detect short documents

```python
test_count_toks = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 5],
        [['this' for i in range(20)], 20]
    ]
)

test_count_toks
```

|   | body | result |
|---|------|--------|
| 0 | [this, this, this, this, this] | 5 |
| 1 | [this, this, this, this, this, this, this, thi... | 20 |
```python
def count_toks(comments: pd.DataFrame) -> pd.DataFrame:
    # add a `toks` column holding the token count per comment
    return comments\
        .assign(toks=lambda x: x['body'].map(len))
```

```python
count_toks(test_count_toks)
```

|   | body | result | toks |
|---|------|--------|------|
| 0 | [this, this, this, this, this] | 5 | 5 |
| 1 | [this, this, this, this, this, this, this, thi... | 20 | 20 |
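A side note on `map(len)`: since `body` now holds Python lists rather than strings, plain `len` is what counts tokens. pandas' `.str.len()` would work here too, as it also computes the length of list elements:

```python
# equivalent token count via the .str accessor
test_count_toks['body'].str.len()
```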
```python
pdt.assert_series_equal(
    count_toks(test_count_toks)['toks'],
    test_count_toks['result'],
    check_names=False
)
```

```python
test_rem_short_comments = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 'short'],
        [['this' for i in range(20)], 'long'],
    ]
)

test_rem_short_comments
```

|   | body | result |
|---|------|--------|
| 0 | [this, this, this, this, this] | short |
| 1 | [this, this, this, this, this, this, this, thi... | long |
```python
@log_step
def rem_short_comments(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # count tokens, keep only comments longer than min_toks, then drop the helper column
    return comments\
        .pipe(count_toks)\
        .query('toks > @min_toks')\
        .drop('toks', axis=1)

rem_short_comments(test_rem_short_comments)
```

```
rem_short_comments   (1, 2)     0:00:00.004103
```

|   | body | result |
|---|------|--------|
| 1 | [this, this, this, this, this, this, this, thi... | long |
```python
pdt.assert_frame_equal(
    rem_short_comments(test_rem_short_comments),
    test_rem_short_comments.query('result == "long"'),
    check_names=False
)
```

```
rem_short_comments   (1, 2)     0:00:00.002157
```
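Because `rem_short_comments` takes a keyword argument, the cutoff can be adjusted per call without touching the function, since `DataFrame.pipe` forwards keyword arguments:

```python
# keep comments with more than 3 tokens instead of the default 10
test_rem_short_comments.pipe(rem_short_comments, min_toks=3)
```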
### pipeline

```python
test_pipe = pd.DataFrame(
    columns=['body', 'result', 'flag'],
    data=[
        ['This is just a test!', ['this', 'is', 'just', 'a', 'test'], False],
        ['This is just a much much much much much much much much much much much much much much much much longer test!', ['this', 'is', 'just', 'a', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'longer', 'test'], True]
    ]
)

def clean_comments(comments: pd.DataFrame) -> pd.DataFrame:
    # chain all preprocessing steps; each step logs its own shape and timing
    return comments\
        .pipe(conv_to_lowerc)\
        .pipe(rm_punct)\
        .pipe(tokenize)\
        .pipe(rem_short_comments)
```
```python
clean_comments(test_pipe)
```

```
conv_to_lowerc       (2, 3)     0:00:00.000878
rm_punct             (2, 3)     0:00:00.002117
tokenize             (2, 3)     0:00:00.000544
rem_short_comments   (1, 3)     0:00:00.002891
```

|   | body | result | flag |
|---|------|--------|------|
| 1 | [this, is, just, a, much, much, much, much, mu... | [this, is, just, a, much, much, much, much, mu... | True |

```python
test_pipe.query('flag == True')['result']
```

```
1    [this, is, just, a, much, much, much, much, mu...
Name: result, dtype: object
```
```python
pdt.assert_series_equal(
    clean_comments(test_pipe)['body'],
    test_pipe.query('flag == True')['result'],
    check_names=False
)
```

```
conv_to_lowerc       (2, 3)     0:00:00.000769
rm_punct             (2, 3)     0:00:00.000795
tokenize             (2, 3)     0:00:00.000535
rem_short_comments   (1, 3)     0:00:00.002410
```
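Because every step is a DataFrame-in, DataFrame-out function, reordering or parameterizing the pipeline only means editing the `.pipe` chain. For instance, a stricter variant of the pipeline (hypothetical, for illustration) needs just one changed line:

```python
def clean_comments_strict(comments: pd.DataFrame) -> pd.DataFrame:
    # same chain, but drop everything with 20 tokens or fewer
    return comments\
        .pipe(conv_to_lowerc)\
        .pipe(rm_punct)\
        .pipe(tokenize)\
        .pipe(rem_short_comments, min_toks=20)
```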
### blacklist for lexemes

`load_blacklist_lex(fpath: str = '../../blacklist_lex.csv', propNouns: bool = True)`
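The function body is not shown here; judging by the signature, it loads a lexeme blacklist from a CSV and optionally includes proper nouns. A minimal sketch of what such a loader could look like, assuming a hypothetical `is_propnoun` column (the real file layout may differ):

```python
def load_blacklist_lex(fpath: str = '../../blacklist_lex.csv',
                       propNouns: bool = True) -> pd.DataFrame:
    # load the blacklist; optionally drop rows flagged as proper nouns
    blacklist = pd.read_csv(fpath)
    if not propNouns:
        blacklist = blacklist.query('is_propnoun == False')
    return blacklist
```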