## preprocessing

### logging function

A small decorator that logs each pipeline step: the function name, the shape of the returned DataFrame, and the time the step took.

```python
from functools import wraps
import datetime as dt

import pandas as pd
import pandas.testing as pdt


def log_step(func):
    """Print the step name, the shape of its result and the elapsed time."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__:20} {str(result.shape):10} {time_taken:20}")
        return result
    return wrapper
```
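To sanity-check the decorator on its own, here is a minimal, illustrative example. The `identity` step is hypothetical and not part of the pipeline; the wrapper only relies on the result having a `.shape`:

```python
@log_step
def identity(df: pd.DataFrame) -> pd.DataFrame:
    return df

identity(pd.DataFrame({'a': [1, 2]}))
# prints something like:
# identity             (2, 1)     0:00:00.000050
```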
### convert to lowercase
```python
@log_step
def conv_to_lowerc(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.lower())
```
```python
test_lower = pd.DataFrame(
    columns=['body', 'result'],
    data=[['Test', 'test'],
          ['TESTS', 'tests']]
)

test_lower.pipe(conv_to_lowerc)
```

```
conv_to_lowerc       (2, 2)     0:00:00.002159
```

|   | body | result |
|---|---|---|
| 0 | test | test |
| 1 | tests | tests |

```python
pdt.assert_series_equal(conv_to_lowerc(test_lower)['body'], test_lower['result'], check_names=False)
```

```
conv_to_lowerc       (2, 2)     0:00:00.000774
```
### remove punctuation
```python
test_rm_punct = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['No-punctuation!', 'No punctuation ']
    ]
)
```

```python
@log_step
def rm_punct(comments: pd.DataFrame) -> pd.DataFrame:
    # replace each run of characters that are neither word characters
    # nor whitespace with a single space
    return comments\
        .assign(body=lambda x: x['body'].str.replace(r'[^\w\s]+', ' ', regex=True))
```
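The pattern can be checked outside pandas with the `re` module; this is just an illustration of the same substitution:

```python
import re

# runs of non-word, non-space characters collapse into one space,
# matching the expected value in `test_rm_punct`
re.sub(r'[^\w\s]+', ' ', 'No-punctuation!')  # -> 'No punctuation '
```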
```python
rm_punct(test_rm_punct)
```

```
rm_punct             (1, 2)     0:00:00.000935
```

|   | body | result |
|---|---|---|
| 0 | No punctuation  | No punctuation  |

```python
pdt.assert_series_equal(
    rm_punct(test_rm_punct)['body'],
    test_rm_punct['result'],
    check_names=False
)
```

```
rm_punct             (1, 2)     0:00:00.000706
```
### tokenize
```python
test_tokenize = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['These are three-tokens ', ['These', 'are', 'three-tokens']]
    ]
)
```

```python
@log_step
def tokenize(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.split())
```
```python
tokenize(test_tokenize)
```

```
tokenize             (1, 2)     0:00:00.000743
```

|   | body | result |
|---|---|---|
| 0 | [These, are, three-tokens] | [These, are, three-tokens] |

```python
pdt.assert_series_equal(
    tokenize(test_tokenize)['body'],
    test_tokenize['result'],
    check_names=False
)
```

```
tokenize             (1, 2)     0:00:00.000654
```
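Note that `str.split()` with no arguments splits on runs of whitespace and drops leading and trailing empties, which is why the trailing space in the test input does not produce an empty token:

```python
pd.Series(['These are three-tokens ']).str.split()[0]
# -> ['These', 'are', 'three-tokens']
```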
### detect short documents
```python
test_count_toks = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 5],
        [['this' for i in range(20)], 20]
    ]
)

test_count_toks
```

|   | body | result |
|---|---|---|
| 0 | [this, this, this, this, this] | 5 |
| 1 | [this, this, this, this, this, this, this, thi... | 20 |
```python
def count_toks(comments: pd.DataFrame) -> pd.DataFrame:
    # helper: add a column with the token count of each comment
    return comments\
            .assign(toks=lambda x: x['body'].map(len))
```
```python
count_toks(test_count_toks)
```

|   | body | result | toks |
|---|---|---|---|
| 0 | [this, this, this, this, this] | 5 | 5 |
| 1 | [this, this, this, this, this, this, this, thi... | 20 | 20 |
```python
pdt.assert_series_equal(
    count_toks(test_count_toks)['toks'],
    test_count_toks['result'],
    check_names=False
)
```

```python
test_rem_short_comments = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 'short'],
        [['this' for i in range(20)], 'long'],
    ]
)

test_rem_short_comments
```

|   | body | result |
|---|---|---|
| 0 | [this, this, this, this, this] | short | 
| 1 | [this, this, this, this, this, this, this, thi... | long | 
```python
@log_step
def rem_short_comments(comments: pd.DataFrame, min_toks: int=10) -> pd.DataFrame:
    # keep only comments with more than `min_toks` tokens; the helper
    # `toks` column is dropped again after filtering
    return comments\
            .pipe(count_toks)\
            .query('toks > @min_toks')\
            .drop('toks', axis=1)
```

```python
rem_short_comments(test_rem_short_comments)
```

```
rem_short_comments   (1, 2)     0:00:00.004103
```
|   | body | result |
|---|---|---|
| 1 | [this, this, this, this, this, this, this, thi... | long |
```python
pdt.assert_frame_equal(
    rem_short_comments(test_rem_short_comments),
    test_rem_short_comments.query('result == "long"'),
    check_names=False
)
```

```
rem_short_comments   (1, 2)     0:00:00.002157
```
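Note the strict comparison in the query: a comment with exactly `min_toks` tokens is also dropped. Since the threshold is a keyword argument, it can be overridden per call; an illustrative example:

```python
# with a lower threshold, the five-token comment survives as well
rem_short_comments(test_rem_short_comments, min_toks=3)
```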
### pipeline
```python
test_pipe = pd.DataFrame(
    columns=['body', 'result', 'flag'],
    data=[
            ['This is just a test!', ['this', 'is', 'just', 'a', 'test'], False],
            ['This is just a much much much much much much much much much much much much much much much much longer test!', ['this', 'is', 'just', 'a', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'longer', 'test'], True]
    ]
)
```

```python
def clean_comments(comments: pd.DataFrame) -> pd.DataFrame:
    # full preprocessing pipeline; each step logs itself via @log_step
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments)
```
```python
clean_comments(test_pipe)
```

```
conv_to_lowerc       (2, 3)     0:00:00.000878
rm_punct             (2, 3)     0:00:00.002117
tokenize             (2, 3)     0:00:00.000544
rem_short_comments   (1, 3)     0:00:00.002891
```

|   | body | result | flag |
|---|---|---|---|
| 1 | [this, is, just, a, much, much, much, much, mu... | [this, is, just, a, much, much, much, much, mu... | True |

```python
test_pipe.query('flag == True')['result']
```

```
1    [this, is, just, a, much, much, much, much, mu...
Name: result, dtype: object
```
```python
pdt.assert_series_equal(
    clean_comments(test_pipe)['body'],
    test_pipe.query('flag == True')['result'],
    check_names=False
)
```

```
conv_to_lowerc       (2, 3)     0:00:00.000769
rm_punct             (2, 3)     0:00:00.000795
tokenize             (2, 3)     0:00:00.000535
rem_short_comments   (1, 3)     0:00:00.002410
```
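`DataFrame.pipe` forwards extra arguments to the piped function, so per-call settings such as the short-comment threshold could be exposed through the pipeline as well. A sketch under that assumption (the function name `clean_comments_with_threshold` is hypothetical; `min_toks=10` just restates the default):

```python
def clean_comments_with_threshold(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # same pipeline, but the threshold is passed through .pipe
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments, min_toks=min_toks)
```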
### blacklist for lexemes

```python
load_blacklist_lex (fpath:str='../../blacklist_lex.csv', propNouns:bool=True)
```
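The body of `load_blacklist_lex` is not shown in this section; a minimal sketch consistent with the signature might look like the following. The CSV column names (`lexeme`, `is_propNoun`) and the meaning of the `propNouns` flag are assumptions, not taken from the actual file:

```python
def load_blacklist_lex(fpath: str = '../../blacklist_lex.csv', propNouns: bool = True) -> pd.DataFrame:
    # hypothetical sketch: load the lexeme blacklist from CSV and, when
    # propNouns is False, drop the rows flagged as proper nouns
    # (the `is_propNoun` column is assumed, not confirmed by the source)
    blacklist = pd.read_csv(fpath)
    if not propNouns:
        blacklist = blacklist[~blacklist['is_propNoun']]
    return blacklist
```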