## Preprocessing

### Logging function

A small decorator that logs each pipeline step: the step's name, the shape of the DataFrame it returns, and how long it took.

```python
from functools import wraps
import datetime as dt

import pandas as pd
import pandas.testing as pdt


def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # one line per step: name, output shape, wall-clock duration
        print(f"{func.__name__:20} {str(result.shape):10} {time_taken:20}")
        return result
    return wrapper
```
`log_step (func)`
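As a quick smoke test of the decorator (my addition, not part of the original pipeline; the no-op `identity` step is hypothetical), one log line should be printed per call:

```python
@log_step
def identity(df: pd.DataFrame) -> pd.DataFrame:
    # no-op step, just to exercise the logging
    return df

identity(pd.DataFrame({'body': ['hello']}))
# prints something like:
# identity             (1, 1)     0:00:00.000048
```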
### Convert to lowercase
```python
@log_step
def conv_to_lowerc(comments: pd.DataFrame) -> pd.DataFrame:
    # lowercase the raw comment text
    return comments\
        .assign(body=lambda x: x['body'].str.lower())
```
`conv_to_lowerc (comments:pandas.core.frame.DataFrame)`
```python
test_lower = pd.DataFrame(
    columns=['body', 'result'],
    data=[['Test', 'test'],
          ['TESTS', 'tests']]
)
test_lower.pipe(conv_to_lowerc)
```
```
conv_to_lowerc       (2, 2)     0:00:00.002159
```
|    | body  | result |
|---:|:------|:-------|
|  0 | test  | test   |
|  1 | tests | tests  |
Each step gets an inline unit test: the transformed `body` column is compared against the expected `result` column. The two Series carry different names, hence `check_names=False`:

```python
pdt.assert_series_equal(conv_to_lowerc(test_lower)['body'], test_lower['result'], check_names=False)
```
```
conv_to_lowerc       (2, 2)     0:00:00.000774
```
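The `assert_*` helpers in `pandas.testing` return `None` when the objects match and raise an `AssertionError` otherwise, so a silent cell means the test passed. A quick illustration of the failure mode (my example, not from the original):

```python
try:
    pdt.assert_series_equal(pd.Series([1]), pd.Series([2]))
except AssertionError:
    print('mismatch detected')
```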
### Remove punctuation
```python
test_rm_punct = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        ['No-punctuation!', 'No punctuation ']
    ]
)
```
```python
@log_step
def rm_punct(comments: pd.DataFrame) -> pd.DataFrame:
    # replace each run of punctuation with a single space
    return comments\
        .assign(body=lambda x: x['body'].str.replace(r'[^\w\s]+', ' ', regex=True))
```
`rm_punct (comments:pandas.core.frame.DataFrame)`
```python
rm_punct(test_rm_punct)
```
```
rm_punct             (1, 2)     0:00:00.000935
```
|    | body            | result          |
|---:|:----------------|:----------------|
|  0 | No punctuation  | No punctuation  |
```python
pdt.assert_series_equal(
    rm_punct(test_rm_punct)['body'],
    test_rm_punct['result'],
    check_names=False
)
```
```
rm_punct             (1, 2)     0:00:00.000706
```
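One detail worth spelling out (my illustration): the `+` in `[^\w\s]+` makes a whole run of punctuation collapse into a single space rather than one space per character:

```python
pd.Series(['Wait... what?!']).str.replace(r'[^\w\s]+', ' ', regex=True)
# 0    Wait  what 
# dtype: object
```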
### Tokenize
```python
test_tokenize = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        ['These are three-tokens ', ['These', 'are', 'three-tokens']]
    ]
)
```
```python
@log_step
def tokenize(comments: pd.DataFrame) -> pd.DataFrame:
    # split each comment into tokens on whitespace
    return comments\
        .assign(body=lambda x: x['body'].str.split())
```
`tokenize (comments:pandas.core.frame.DataFrame)`
```python
tokenize(test_tokenize)
```
```
tokenize             (1, 2)     0:00:00.000743
```
|    | body                       | result                     |
|---:|:---------------------------|:---------------------------|
|  0 | [These, are, three-tokens] | [These, are, three-tokens] |
```python
pdt.assert_series_equal(
    tokenize(test_tokenize)['body'],
    test_tokenize['result'],
    check_names=False
)
```
```
tokenize             (1, 2)     0:00:00.000654
```
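Note that `str.split()` with no arguments splits on runs of whitespace and ignores leading/trailing whitespace, which is why the trailing space in the test input does not produce an empty token. A side illustration of mine:

```python
pd.Series(['a  b\tc ']).str.split()
# 0    [a, b, c]
# dtype: object
```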
### Detect short documents
```python
test_count_toks = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 5],
        [['this' for i in range(20)], 20]
    ]
)
```
```python
test_count_toks
```
|    | body                                              |   result |
|---:|:--------------------------------------------------|---------:|
|  0 | [this, this, this, this, this]                    |        5 |
|  1 | [this, this, this, this, this, this, this, thi... |       20 |
```python
def count_toks(comments: pd.DataFrame) -> pd.DataFrame:
    # token count per comment ('body' holds a list of tokens at this point)
    return comments\
        .assign(toks=lambda x: x['body'].map(len))
```
`count_toks (comments:pandas.core.frame.DataFrame)`
```python
count_toks(test_count_toks)
```
|    | body                                              |   result |   toks |
|---:|:--------------------------------------------------|---------:|-------:|
|  0 | [this, this, this, this, this]                    |        5 |      5 |
|  1 | [this, this, this, this, this, this, this, thi... |       20 |     20 |
```python
pdt.assert_series_equal(
    count_toks(test_count_toks)['toks'],
    test_count_toks['result'],
    check_names=False
)
```
```python
test_rem_short_comments = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 'short'],
        [['this' for i in range(20)], 'long'],
    ]
)
```
```python
test_rem_short_comments
```
|    | body                                              | result |
|---:|:--------------------------------------------------|:-------|
|  0 | [this, this, this, this, this]                    | short  |
|  1 | [this, this, this, this, this, this, this, thi... | long   |
```python
@log_step
def rem_short_comments(comments: pd.DataFrame, min_toks: int=10) -> pd.DataFrame:
    # drop comments with at most `min_toks` tokens; `@min_toks` in `query`
    # references the local Python variable
    return comments\
        .pipe(count_toks)\
        .query('toks > @min_toks')\
        .drop('toks', axis=1)
```
`rem_short_comments (comments:pandas.core.frame.DataFrame, min_toks:int=10)`
```python
rem_short_comments(test_rem_short_comments)
```
```
rem_short_comments   (1, 2)     0:00:00.004103
```
|    | body                                              | result |
|---:|:--------------------------------------------------|:-------|
|  1 | [this, this, this, this, this, this, this, thi... | long   |
```python
pdt.assert_frame_equal(
    rem_short_comments(test_rem_short_comments),
    test_rem_short_comments.query('result == "long"'),
    check_names=False
)
```
```
rem_short_comments   (1, 2)     0:00:00.002157
```
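Because `rem_short_comments` takes `min_toks` as a keyword argument, `DataFrame.pipe` can forward it, so the threshold can be adjusted per call without touching the function. For example (my sketch, with an arbitrary threshold of 3):

```python
test_rem_short_comments.pipe(rem_short_comments, min_toks=3)
# now both the 5-token and the 20-token comment survive
```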
### Pipeline
```python
test_pipe = pd.DataFrame(
    columns=['body', 'result', 'flag'],
    data=[
        ['This is just a test!',
         ['this', 'is', 'just', 'a', 'test'],
         False],
        ['This is just a much much much much much much much much much much much much much much much much longer test!',
         ['this', 'is', 'just', 'a', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'longer', 'test'],
         True]
    ]
)
```
```python
def clean_comments(comments: pd.DataFrame) -> pd.DataFrame:
    # full preprocessing pipeline: lowercase -> strip punctuation ->
    # tokenize -> drop short comments
    return comments\
        .pipe(conv_to_lowerc)\
        .pipe(rm_punct)\
        .pipe(tokenize)\
        .pipe(rem_short_comments)
```
`clean_comments (comments:pandas.core.frame.DataFrame)`
```python
clean_comments(test_pipe)
```
```
conv_to_lowerc       (2, 3)     0:00:00.000878
rm_punct             (2, 3)     0:00:00.002117
tokenize             (2, 3)     0:00:00.000544
rem_short_comments   (1, 3)     0:00:00.002891
```
|    | body                                              | result                                            | flag |
|---:|:--------------------------------------------------|:--------------------------------------------------|:-----|
|  1 | [this, is, just, a, much, much, much, much, mu... | [this, is, just, a, much, much, much, much, mu... | True |
```python
test_pipe.query('flag == True')['result']
```
```
1    [this, is, just, a, much, much, much, much, mu...
Name: result, dtype: object
```
```python
pdt.assert_series_equal(
    clean_comments(test_pipe)['body'],
    test_pipe.query('flag == True')['result'],
    check_names=False
)
```
```
conv_to_lowerc       (2, 3)     0:00:00.000769
rm_punct             (2, 3)     0:00:00.000795
tokenize             (2, 3)     0:00:00.000535
rem_short_comments   (1, 3)     0:00:00.002410
```
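As an aside (mine, not the notebook's): chaining with `pipe` is just readable function composition, so the pipeline above is equivalent to the nested call

```python
rem_short_comments(tokenize(rm_punct(conv_to_lowerc(test_pipe))))
```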
### Blacklist for lexemes
`load_blacklist_lex (fpath:str='../../blacklist_lex.csv', propNouns:bool=True)`
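The body of `load_blacklist_lex` is not shown in this section. A minimal sketch consistent with its signature could look like the following; the column names (`lexeme`, `is_proper_noun`) and the meaning of `propNouns` are assumptions on my part, not the notebook's actual implementation:

```python
def load_blacklist_lex(fpath: str='../../blacklist_lex.csv', propNouns: bool=True) -> pd.DataFrame:
    # ASSUMPTION: the CSV has a 'lexeme' column plus a boolean 'is_proper_noun' flag
    blacklist = pd.read_csv(fpath)
    if not propNouns:
        # ASSUMPTION: propNouns=False excludes proper nouns from the blacklist
        blacklist = blacklist.query('~is_proper_noun')
    return blacklist
```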