read data

create test data

Because of size constraints, this repository contains only a sample of the original data. The code below creates that sample by drawing up to n rows from each source csv file and writing them to a mirrored directory tree.

from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

src_dir = Path('../../data')
dest_dir = Path('../data_test')

n = 100  # number of rows to sample from each csv file

for subdir in src_dir.rglob('*'):
    if subdir.is_dir():
        # mirror the source directory structure under dest_dir
        dest_subdir = dest_dir / subdir.relative_to(src_dir)
        dest_subdir.mkdir(parents=True, exist_ok=True)
        for file_path in tqdm(list(subdir.glob('*.csv'))):
            df = pd.read_csv(file_path, on_bad_lines='skip', engine='python')
            # sample at most n rows per file, reproducibly
            df_sample = df.sample(n=min(n, len(df)), random_state=58)
            sample_file_path = dest_subdir / file_path.name
            df_sample.to_csv(sample_file_path, index=False)
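
As a sanity check on the sampling step, a short sketch like the one below (reusing the dest_dir and n defined above; this check is an assumption about how one might verify the result, not part of the original pipeline) could confirm that every sampled file holds at most n rows.

# sketch: verify that no sampled csv file exceeds n rows
for sample_path in dest_dir.rglob('*.csv'):
    n_rows = len(pd.read_csv(sample_path, on_bad_lines='skip', engine='python'))
    assert n_rows <= n, f'{sample_path} has {n_rows} rows, expected at most {n}'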

read data

get file paths

def get_fpaths_year(year: str,
                    dir='../data_test/years'
                    ) -> list:
    dir_path = Path(dir)
    return list(dir_path.glob(f'{year}*.csv'))

source

get_fpaths_year

 get_fpaths_year (year:str, dir='../data_test/years')
get_fpaths_year('2020')
[PosixPath('../data_test/years/2020-04-14_21:20:57___2020-04-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-04-07_21:19:06___2020-04-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-14_21:19:36___2020-06-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-10-14_21:19:48___2020-10-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-07_21:19:08___2020-06-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-02-07_22:18:35___2020-02-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-07-14_21:22:47___2020-07-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-01_21:59:59___2020-06-01_21:22:24.csv'),
 PosixPath('../data_test/years/2020-09-19_21:14:30___2020-09-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-07-01_21:59:59___2020-07-01_21:23:38.csv'),
 PosixPath('../data_test/years/2020-10-19_14:18:54___2020-10-19_14:58:31.csv'),
 PosixPath('../data_test/years/2020-02-01_22:59:59___2020-02-01_22:07:59.csv'),
 PosixPath('../data_test/years/2020-08-14_21:23:04___2020-08-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-03-19_22:14:44___2020-03-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-01_22:59:59___2020-11-01_22:20:51.csv'),
 PosixPath('../data_test/years/2020-01-01_22:59:59___2020-01-01_22:13:27.csv'),
 PosixPath('../data_test/years/2020-07-19_21:19:00___2020-07-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-10-07_21:20:42___2020-10-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-08-01_21:59:59___2020-08-01_21:17:27.csv'),
 PosixPath('../data_test/years/2020-03-07_22:37:38___2020-03-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-05-14_21:22:43___2020-05-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-12-07_22:21:23___2020-12-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-03-14_22:09:26___2020-03-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-05-01_21:59:59___2020-05-01_21:19:50.csv'),
 PosixPath('../data_test/years/2020-01-19_22:39:11___2020-01-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-08-07_21:22:06___2020-08-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-09-07_21:19:15___2020-09-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-07-07_21:22:39___2020-07-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-04-19_21:17:49___2020-04-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-01-14_22:13:57___2020-01-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-19_22:27:49___2020-11-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-12-01_22:59:59___2020-12-01_22:22:27.csv'),
 PosixPath('../data_test/years/2020-05-07_21:22:03___2020-05-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-01-07_22:21:30___2020-01-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-07_22:19:48___2020-11-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-14_22:15:34___2020-11-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-09-14_21:31:19___2020-09-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-05-19_21:22:04___2020-05-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-09-01_21:59:59___2020-09-01_21:24:54.csv'),
 PosixPath('../data_test/years/2020-10-01_21:59:59___2020-10-01_21:20:26.csv'),
 PosixPath('../data_test/years/2020-12-19_22:29:40___2020-12-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-03-01_22:59:59___2020-03-01_22:11:57.csv'),
 PosixPath('../data_test/years/2020-04-01_21:59:59___2020-04-01_21:18:25.csv'),
 PosixPath('../data_test/years/2020-12-14_22:21:47___2020-12-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-02-14_22:23:09___2020-02-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-02-19_22:14:28___2020-02-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-06-19_21:18:29___2020-06-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-08-19_21:22:40___2020-08-19_21:59:59.csv')]
def get_fpaths_subreddit(subreddit: str, dir: str='../data_test/subreddits') -> list: 
    dir_path = Path(dir)
    return list(dir_path.glob(f'{subreddit}*.csv'))

source

get_fpaths_subreddit

 get_fpaths_subreddit (subreddit:str, dir:str='../data_test/subreddits')
get_fpaths_subreddit('conspiracy')
[PosixPath('../data_test/subreddits/conspiracy___2020-11-17_11:02:26___2020-11-27_22:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-03-01_23:00:02___2020-03-09_22:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-01-03_19:39:57___2020-01-27_22:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-04-17_04:25:29___2020-04-27_21:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-05-14_00:35:50___2020-05-27_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-06_03:54:59___2020-02-27_22:59:57.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-07-01_22:00:04___2020-07-09_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-11-01_23:00:04___2020-11-09_22:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-01_23:00:04___2020-02-09_22:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-08-01_22:00:01___2020-08-09_21:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-10-16_22:19:53___2020-10-27_22:59:46.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-09-01_22:00:03___2020-09-09_21:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-03-17_15:00:57___2020-03-27_22:59:52.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-05-01_22:00:03___2020-05-09_21:59:48.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-06-15_13:59:56___2020-06-27_21:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-07-18_14:50:04___2020-07-27_21:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-12-15_22:12:50___2020-12-27_22:59:52.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-04-01_22:00:01___2020-04-09_21:59:49.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-09-13_14:27:06___2020-09-27_21:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-12-01_23:00:02___2020-12-09_22:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-06-01_22:00:04___2020-06-09_21:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-01-01_23:00:03___2020-01-09_22:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-10-01_22:00:10___2020-10-09_21:59:57.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-08-17_21:01:44___2020-08-27_21:59:57.csv')]

read comments

read a single csv file of comments

fpath = get_fpaths_year('2019')[0]

def read_one_comments_csv(fpath: str) -> pd.DataFrame:
    try:
        comments = pd.read_csv(
            fpath,
            usecols=['id', 'created_utc', 'author', 'subreddit', 'body'],
            dtype={
                'id': 'string',
                'author': 'string',
                'subreddit': 'string',
                'body': 'string'
            },
            parse_dates=['created_utc'],
            low_memory=False,
            lineterminator='\n'
        )
        # drop rows with missing values and duplicate comment ids
        comments_clean = comments\
            .dropna()\
            .drop_duplicates(subset='id')
        return comments_clean
    except FileNotFoundError:
        print(f'{fpath} not found on disk')
    except pd.errors.EmptyDataError:
        print(f'{fpath} is empty')

source

read_one_comments_csv

 read_one_comments_csv (fpath:str)
read_one_comments_csv(fpath)
author body created_utc id subreddit
0 lilfooty This'll hurt them more than the loss 2019-05-07 21:55:57 emrz5jp soccer
1 Kaeneko I loved vampires *so* much, lol. Always fantas... 2019-05-07 21:34:12 emrx5eq BDSMcommunity
2 Les_Deplorables Poor Zombies gonna starve. No Brains! 2019-05-07 21:21:11 emrvxjq The_Donald
3 viper2544 No one is going to mention the $12 shipping? 2019-05-07 21:56:45 emrz8g7 legostarwars
4 ninjasquirrelarmy Agreed. I showed my stylist the Phoenix hair ... 2019-05-07 21:34:43 emrx730 Instagramreality
... ... ... ... ... ...
95 kleptominotaur Is tj still a muscle pharm dude? or parm? what... 2019-05-07 21:35:05 emrx88m MMA
96 bonesstackedonfloor Fidgeting 2019-05-07 21:45:27 emry5y5 AskReddit
97 Perfectoi Imagine thinking EV will be sacked. Friendly r... 2019-05-07 21:18:06 emrvmx0 Barca
98 BB-Zwei And Dumbo. 2019-05-07 21:51:35 emryq8t movies
99 Thomas2PP How did he got it??? 2019-05-07 21:15:42 emrveqf hearthstone

100 rows × 5 columns

read multiple csv files with comments

def read_multi_comments_csvs(fpaths: list) -> pd.DataFrame:
    comments_lst = []
    for fpath in fpaths:
        comments = read_one_comments_csv(fpath)
        # skip files that could not be read (read_one_comments_csv returns None)
        if comments is not None:
            comments_lst.append(comments)
    comments_concat = pd.concat(
        comments_lst,
        axis=0,
        ignore_index=True
    )
    return comments_concat

source

read_multi_comments_csvs

 read_multi_comments_csvs (fpaths:list)
fpaths = get_fpaths_year('2019')
read_multi_comments_csvs(fpaths)
author body created_utc id subreddit
0 lilfooty This'll hurt them more than the loss 2019-05-07 21:55:57 emrz5jp soccer
1 Kaeneko I loved vampires *so* much, lol. Always fantas... 2019-05-07 21:34:12 emrx5eq BDSMcommunity
2 Les_Deplorables Poor Zombies gonna starve. No Brains! 2019-05-07 21:21:11 emrvxjq The_Donald
3 viper2544 No one is going to mention the $12 shipping? 2019-05-07 21:56:45 emrz8g7 legostarwars
4 ninjasquirrelarmy Agreed. I showed my stylist the Phoenix hair ... 2019-05-07 21:34:43 emrx730 Instagramreality
... ... ... ... ... ...
4794 m00sedad Donald Fucking Trump 2019-06-19 21:12:28 erl5gls AskReddit
4795 Abramabundiz obviously the office or parks, or maybe a spin... 2019-06-19 21:35:15 erl7gic AskReddit
4796 StarrySkye3 That sounds like someone who argues that other... 2019-06-19 21:33:57 erl7ccj otherkin
4797 mostoriginalusername I hadn't heard about that one. :) 2019-06-19 21:41:22 erl7zzj catsareliquid
4798 ggkiyo I won't lie, I was someone who said that dota ... 2019-06-19 21:54:45 erl96fb Games

4799 rows × 5 columns
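
The same helpers compose in the same way for the subreddit files; a minimal sketch, assuming the test data layout above:

# sketch: read and chronologically order all sampled comments for one subreddit
fpaths = get_fpaths_subreddit('conspiracy')
comments = read_multi_comments_csvs(fpaths)
comments = comments.sort_values('created_utc', ignore_index=True)
comments.head()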