preprocessing

Each preprocessing step below is a DataFrame-in, DataFrame-out function, so the steps can be chained with pandas' .pipe and unit-tested in isolation.

logging function

A small decorator that prints each step's name, the shape of its output, and the time it took, so the pipeline reports on itself as it runs.

import datetime as dt
from functools import wraps

import pandas as pd
import pandas.testing as pdt


def log_step(func):
    @wraps(func)  # preserve the wrapped function's name for the log line
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # step name, output shape, wall-clock duration
        print(f"{func.__name__:20} {str(result.shape):10} {time_taken:20}")
        return result
    return wrapper
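
log_step assumes the wrapped function returns something with a .shape attribute (a DataFrame or array); a quick sanity check with a throwaway step (identity is mine, not part of the pipeline):

@log_step
def identity(df: pd.DataFrame) -> pd.DataFrame:
    # trivial step: returns its input unchanged, just to exercise the logger
    return df

identity(pd.DataFrame({'body': ['hello']}))
# prints: identity             (1, 1)     0:00:00.xxxxxx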

convert to lowercase

@log_step
def conv_to_lowerc(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.lower())

test_lower = pd.DataFrame(
    columns=['body', 'result'],
    data=[['Test', 'test'],
          ['TESTS', 'tests']]
)
test_lower.pipe(conv_to_lowerc)
conv_to_lowerc       (2, 2)     0:00:00.002159      
    body result
0   test   test
1  tests  tests
pdt.assert_series_equal(
    conv_to_lowerc(test_lower)['body'],
    test_lower['result'],
    check_names=False
)
conv_to_lowerc       (2, 2)     0:00:00.000774      
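
Because .assign returns a new DataFrame rather than mutating its input, the test frame is untouched by the calls above, which is what makes re-running steps safe:

test_lower['body']  # still the original mixed-case values
0     Test
1    TESTS
Name: body, dtype: object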

remove punctuation

test_rm_punct = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['No-punctuation!', 'No punctuation ']
    ]
)
@log_step
def rm_punct(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
        .assign(body=lambda x: x['body'].str.replace(r'[^\w\s]+', ' ', regex=True))

rm_punct(test_rm_punct)
rm_punct             (1, 2)     0:00:00.000935      
              body            result
0  No punctuation   No punctuation 
pdt.assert_series_equal(
    rm_punct(test_rm_punct)['body'],
    test_rm_punct['result'],
    check_names=False
)
rm_punct             (1, 2)     0:00:00.000706      
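
A detail worth knowing about the pattern: \w matches letters, digits, and underscores, so underscores survive this step while apostrophes and hyphens become spaces; a quick illustration (not part of the pipeline):

pd.Series(["don't stop_me now!"]).str.replace(r'[^\w\s]+', ' ', regex=True)
0    don t stop_me now 
dtype: object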

tokenize

test_tokenize = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['These are three-tokens ', ['These', 'are', 'three-tokens']]
    ]
)
@log_step
def tokenize(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.split())

tokenize(test_tokenize)
tokenize             (1, 2)     0:00:00.000743      
                         body                      result
0  [These, are, three-tokens]  [These, are, three-tokens]
pdt.assert_series_equal(
    tokenize(test_tokenize)['body'],
    test_tokenize['result'],
    check_names=False
)
tokenize             (1, 2)     0:00:00.000654      
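
str.split() with no arguments splits on any run of whitespace and drops leading and trailing whitespace, so the stray spaces left behind by rm_punct never turn into empty tokens:

pd.Series(['  spaced   out  ']).str.split()
0    [spaced, out]
dtype: object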

detect short documents

test_count_toks = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 5],
        [['this' for i in range(20)], 20]
    ]
)
test_count_toks
                                                body  result
0                     [this, this, this, this, this]       5
1  [this, this, this, this, this, this, this, thi...      20
def count_toks(comments: pd.DataFrame) -> pd.DataFrame:
    # helper: add a token-count column (undecorated, so it prints no log line)
    return comments\
            .assign(toks=lambda x: x['body'].map(len))

count_toks(test_count_toks)
                                                body  result  toks
0                     [this, this, this, this, this]       5     5
1  [this, this, this, this, this, this, this, thi...      20    20
pdt.assert_series_equal(
    count_toks(test_count_toks)['toks'],
    test_count_toks['result'],
    check_names=False
)
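
count_toks relies on body already holding token lists: .map(len) applies Python's len to each cell, so running it before tokenize would count characters rather than tokens.

pd.Series([['a', 'b'], ['a']]).map(len)
0    2
1    1
dtype: int64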
test_rem_short_comments = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 'short'],
        [['this' for i in range(20)], 'long'],
    ]
)
test_rem_short_comments
                                                body result
0                     [this, this, this, this, this]  short
1  [this, this, this, this, this, this, this, thi...   long
@log_step
def rem_short_comments(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # keep only comments with strictly more than `min_toks` tokens
    return comments\
            .pipe(count_toks)\
            .query('toks > @min_toks')\
            .drop('toks', axis=1)

rem_short_comments(test_rem_short_comments)
rem_short_comments   (1, 2)     0:00:00.004103      
                                                body result
1  [this, this, this, this, this, this, this, thi...   long
pdt.assert_frame_equal(
    rem_short_comments(test_rem_short_comments),
    test_rem_short_comments.query('result == "long"'),
    check_names=False
)
rem_short_comments   (1, 2)     0:00:00.002157      
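
Since min_toks is a keyword argument, the threshold can be adjusted per call; with min_toks=3 both test comments survive, because each has more than three tokens:

rem_short_comments(test_rem_short_comments, min_toks=3)
# keeps both rows: 5 > 3 and 20 > 3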

pipeline

test_pipe = pd.DataFrame(
    columns=['body', 'result', 'flag'],
    data=[
            ['This is just a test!', ['this', 'is', 'just', 'a', 'test'], False],
            ['This is just a much much much much much much much much much much much much much much much much longer test!', ['this', 'is', 'just', 'a', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'longer', 'test'], True]
    ]
)
def clean_comments(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments)

clean_comments(test_pipe)
conv_to_lowerc       (2, 3)     0:00:00.000878      
rm_punct             (2, 3)     0:00:00.002117      
tokenize             (2, 3)     0:00:00.000544      
rem_short_comments   (1, 3)     0:00:00.002891      
                                                body                                             result  flag
1  [this, is, just, a, much, much, much, much, mu...  [this, is, just, a, much, much, much, much, mu...  True
test_pipe.query('flag == True')['result']
1    [this, is, just, a, much, much, much, much, mu...
Name: result, dtype: object
pdt.assert_series_equal(
    clean_comments(test_pipe)['body'],
    test_pipe.query('flag == True')['result'],
    check_names=False
)
conv_to_lowerc       (2, 3)     0:00:00.000769      
rm_punct             (2, 3)     0:00:00.000795      
tokenize             (2, 3)     0:00:00.000535      
rem_short_comments   (1, 3)     0:00:00.002410      
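
clean_comments pins rem_short_comments to its default threshold; a hypothetical variant (clean_comments_min is my name, not from the original) shows how .pipe forwards the parameter through the chain:

def clean_comments_min(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # same chain as clean_comments, but with a tunable token threshold
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments, min_toks=min_toks)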

blacklist for lexemes


load_blacklist_lex

 load_blacklist_lex (fpath:str='../../blacklist_lex.csv',
                     propNouns:bool=True)
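
The body of load_blacklist_lex is not shown in this section; going only by its signature, here is a minimal sketch of what such a loader might look like. The column name pos and the PROPN tag below are assumptions about the CSV layout, not taken from the original:

def load_blacklist_lex(fpath: str = '../../blacklist_lex.csv',
                       propNouns: bool = True) -> pd.DataFrame:
    # assumption: the CSV holds one blacklisted lexeme per row
    blacklist = pd.read_csv(fpath)
    # assumption: a 'pos' column tags proper nouns; drop them when propNouns=False
    if not propNouns:
        blacklist = blacklist.query('pos != "PROPN"')
    return blacklist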