preprocessing

Each preprocessing step below is a DataFrame-in, DataFrame-out function, so the steps can be chained with pandas' .pipe and unit-tested in isolation.

logging function

A small decorator that prints each step's name, the shape of its output, and the time it took, so the pipeline reports on itself as it runs.

import datetime as dt
from functools import wraps

import pandas as pd
import pandas.testing as pdt


def log_step(func):
    @wraps(func)  # preserve the wrapped function's name for the log line
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # step name, output shape, wall-clock duration
        print(f"{func.__name__:20} {str(result.shape):10} {time_taken:20}")
        return result
    return wrapper
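
log_step assumes the wrapped function returns something with a .shape attribute (a DataFrame or array); a quick sanity check with a throwaway step (identity is mine, not part of the pipeline):

@log_step
def identity(df: pd.DataFrame) -> pd.DataFrame:
    # trivial step: returns its input unchanged, just to exercise the logger
    return df

identity(pd.DataFrame({'body': ['hello']}))
# prints: identity             (1, 1)     0:00:00.xxxxxx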

convert to lowercase

@log_step
def conv_to_lowerc(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.lower())

test_lower = pd.DataFrame(
    columns=['body', 'result'],
    data=[['Test', 'test'],
          ['TESTS', 'tests']]
)
test_lower.pipe(conv_to_lowerc)
conv_to_lowerc       (2, 2)     0:00:00.002159      
    body result
0   test   test
1  tests  tests
pdt.assert_series_equal(
    conv_to_lowerc(test_lower)['body'],
    test_lower['result'],
    check_names=False
)
conv_to_lowerc       (2, 2)     0:00:00.000774      
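
Because .assign returns a new DataFrame rather than mutating its input, the test frame is untouched by the calls above, which is what makes re-running steps safe:

test_lower['body']  # still the original mixed-case values
0     Test
1    TESTS
Name: body, dtype: object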

remove punctuation

test_rm_punct = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['No-punctuation!', 'No punctuation ']
    ]
)
@log_step
def rm_punct(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
        .assign(body=lambda x: x['body'].str.replace(r'[^\w\s]+', ' ', regex=True))

rm_punct(test_rm_punct)
rm_punct             (1, 2)     0:00:00.000935      
              body            result
0  No punctuation   No punctuation 
pdt.assert_series_equal(
    rm_punct(test_rm_punct)['body'],
    test_rm_punct['result'],
    check_names=False
)
rm_punct             (1, 2)     0:00:00.000706      
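
A detail worth knowing about the pattern: \w matches letters, digits, and underscores, so underscores survive this step while apostrophes and hyphens become spaces; a quick illustration (not part of the pipeline):

pd.Series(["don't stop_me now!"]).str.replace(r'[^\w\s]+', ' ', regex=True)
0    don t stop_me now 
dtype: object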

tokenize

test_tokenize = pd.DataFrame(
    columns=['body', 'result'],
    data=[
            ['These are three-tokens ', ['These', 'are', 'three-tokens']]
    ]
)
@log_step
def tokenize(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .assign(body=lambda x: x['body'].str.split())

tokenize(test_tokenize)
tokenize             (1, 2)     0:00:00.000743      
                         body                      result
0  [These, are, three-tokens]  [These, are, three-tokens]
pdt.assert_series_equal(
    tokenize(test_tokenize)['body'],
    test_tokenize['result'],
    check_names=False
)
tokenize             (1, 2)     0:00:00.000654      
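
str.split() with no arguments splits on any run of whitespace and drops leading and trailing whitespace, so the stray spaces left behind by rm_punct never turn into empty tokens:

pd.Series(['  spaced   out  ']).str.split()
0    [spaced, out]
dtype: object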

detect short documents

test_count_toks = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 5],
        [['this' for i in range(20)], 20]
    ]
)
test_count_toks
                                                body  result
0                     [this, this, this, this, this]       5
1  [this, this, this, this, this, this, this, thi...      20
def count_toks(comments: pd.DataFrame) -> pd.DataFrame:
    # helper: add a token-count column (undecorated, so it prints no log line)
    return comments\
            .assign(toks=lambda x: x['body'].map(len))

count_toks(test_count_toks)
                                                body  result  toks
0                     [this, this, this, this, this]       5     5
1  [this, this, this, this, this, this, this, thi...      20    20
pdt.assert_series_equal(
    count_toks(test_count_toks)['toks'],
    test_count_toks['result'],
    check_names=False
)
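
count_toks relies on body already holding token lists: .map(len) applies Python's len to each cell, so running it before tokenize would count characters rather than tokens.

pd.Series([['a', 'b'], ['a']]).map(len)
0    2
1    1
dtype: int64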
test_rem_short_comments = pd.DataFrame(
    columns=['body', 'result'],
    data=[
        [['this' for i in range(5)], 'short'],
        [['this' for i in range(20)], 'long'],
    ]
)
test_rem_short_comments
                                                body result
0                     [this, this, this, this, this]  short
1  [this, this, this, this, this, this, this, thi...   long
@log_step
def rem_short_comments(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # keep only comments with strictly more than `min_toks` tokens
    return comments\
            .pipe(count_toks)\
            .query('toks > @min_toks')\
            .drop('toks', axis=1)

rem_short_comments(test_rem_short_comments)
rem_short_comments   (1, 2)     0:00:00.004103      
                                                body result
1  [this, this, this, this, this, this, this, thi...   long
pdt.assert_frame_equal(
    rem_short_comments(test_rem_short_comments),
    test_rem_short_comments.query('result == "long"'),
    check_names=False
)
rem_short_comments   (1, 2)     0:00:00.002157      
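
Since min_toks is a keyword argument, the threshold can be adjusted per call; with min_toks=3 both test comments survive, because each has more than three tokens:

rem_short_comments(test_rem_short_comments, min_toks=3)
# keeps both rows: 5 > 3 and 20 > 3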

pipeline

test_pipe = pd.DataFrame(
    columns=['body', 'result', 'flag'],
    data=[
            ['This is just a test!', ['this', 'is', 'just', 'a', 'test'], False],
            ['This is just a much much much much much much much much much much much much much much much much longer test!', ['this', 'is', 'just', 'a', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'much', 'longer', 'test'], True]
    ]
)
def clean_comments(comments: pd.DataFrame) -> pd.DataFrame:
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments)

clean_comments(test_pipe)
conv_to_lowerc       (2, 3)     0:00:00.000878      
rm_punct             (2, 3)     0:00:00.002117      
tokenize             (2, 3)     0:00:00.000544      
rem_short_comments   (1, 3)     0:00:00.002891      
                                                body                                             result  flag
1  [this, is, just, a, much, much, much, much, mu...  [this, is, just, a, much, much, much, much, mu...  True
test_pipe.query('flag == True')['result']
1    [this, is, just, a, much, much, much, much, mu...
Name: result, dtype: object
pdt.assert_series_equal(
    clean_comments(test_pipe)['body'],
    test_pipe.query('flag == True')['result'],
    check_names=False
)
conv_to_lowerc       (2, 3)     0:00:00.000769      
rm_punct             (2, 3)     0:00:00.000795      
tokenize             (2, 3)     0:00:00.000535      
rem_short_comments   (1, 3)     0:00:00.002410      
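
clean_comments pins rem_short_comments to its default threshold; a hypothetical variant (clean_comments_min is my name, not from the original) shows how .pipe forwards the parameter through the chain:

def clean_comments_min(comments: pd.DataFrame, min_toks: int = 10) -> pd.DataFrame:
    # same chain as clean_comments, but with a tunable token threshold
    return comments\
            .pipe(conv_to_lowerc)\
            .pipe(rm_punct)\
            .pipe(tokenize)\
            .pipe(rem_short_comments, min_toks=min_toks)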

blacklist for lexemes


load_blacklist_lex

 load_blacklist_lex (fpath:str='../../blacklist_lex.csv',
                     propNouns:bool=True)
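
The body of load_blacklist_lex is not shown in this section; going only by its signature, here is a minimal sketch of what such a loader might look like. The column name pos and the PROPN tag below are assumptions about the CSV layout, not taken from the original:

def load_blacklist_lex(fpath: str = '../../blacklist_lex.csv',
                       propNouns: bool = True) -> pd.DataFrame:
    # assumption: the CSV holds one blacklisted lexeme per row
    blacklist = pd.read_csv(fpath)
    # assumption: a 'pos' column tags proper nouns; drop them when propNouns=False
    if not propNouns:
        blacklist = blacklist.query('pos != "PROPN"')
    return blacklist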