from tqdm.notebook import tqdm

# Build a small sampled copy of the raw data tree for the repository:
# mirror every sub-directory of src_dir under dest_dir and write an
# n-row random sample of each CSV file found there.
src_dir = Path('../../data')
dest_dir = Path('../data_test')
n = 100  # rows to keep per file (fewer if the file has fewer rows)

for subdir in (d for d in src_dir.rglob('*') if d.is_dir()):
    target = dest_dir / subdir.relative_to(src_dir)
    target.mkdir(parents=True, exist_ok=True)
    for csv_path in tqdm(list(subdir.glob('*.csv'))):
        frame = pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
        # fixed seed so the sampled test data is reproducible
        sampled = frame.sample(n=min(n, len(frame)), random_state=58)
        sampled.to_csv(target / csv_path.name, index=False)
create test data
This repository only contains a sample of the original data because of size constraints. Below, we create a sample of the original data.
read data
get file paths
def get_fpaths_year(year: str,
                    dir: str = '../data_test/years'
                    ) -> list:
    """Return paths of all CSV files in `dir` whose name starts with `year`.

    Args:
        year: year prefix to match, e.g. '2020'.
        dir: directory to search (non-recursive). Name kept for
            backward compatibility with keyword callers, although it
            shadows the `dir` builtin.

    Returns:
        List of `pathlib.Path` objects; empty if nothing matches or the
        directory does not exist.
    """
    return list(Path(dir).glob(f'{year}*.csv'))
get_fpaths_year (year:str, dir='../data_test/years')
get_fpaths_year('2020')

[PosixPath('../data_test/years/2020-04-14_21:20:57___2020-04-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-04-07_21:19:06___2020-04-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-14_21:19:36___2020-06-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-10-14_21:19:48___2020-10-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-07_21:19:08___2020-06-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-02-07_22:18:35___2020-02-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-07-14_21:22:47___2020-07-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-01_21:59:59___2020-06-01_21:22:24.csv'),
 PosixPath('../data_test/years/2020-09-19_21:14:30___2020-09-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-07-01_21:59:59___2020-07-01_21:23:38.csv'),
 PosixPath('../data_test/years/2020-10-19_14:18:54___2020-10-19_14:58:31.csv'),
 PosixPath('../data_test/years/2020-02-01_22:59:59___2020-02-01_22:07:59.csv'),
 PosixPath('../data_test/years/2020-08-14_21:23:04___2020-08-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-03-19_22:14:44___2020-03-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-01_22:59:59___2020-11-01_22:20:51.csv'),
 PosixPath('../data_test/years/2020-01-01_22:59:59___2020-01-01_22:13:27.csv'),
 PosixPath('../data_test/years/2020-07-19_21:19:00___2020-07-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-10-07_21:20:42___2020-10-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-08-01_21:59:59___2020-08-01_21:17:27.csv'),
 PosixPath('../data_test/years/2020-03-07_22:37:38___2020-03-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-05-14_21:22:43___2020-05-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-12-07_22:21:23___2020-12-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-03-14_22:09:26___2020-03-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-05-01_21:59:59___2020-05-01_21:19:50.csv'),
 PosixPath('../data_test/years/2020-01-19_22:39:11___2020-01-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-08-07_21:22:06___2020-08-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-09-07_21:19:15___2020-09-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-07-07_21:22:39___2020-07-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-04-19_21:17:49___2020-04-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-01-14_22:13:57___2020-01-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-19_22:27:49___2020-11-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-12-01_22:59:59___2020-12-01_22:22:27.csv'),
 PosixPath('../data_test/years/2020-05-07_21:22:03___2020-05-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-01-07_22:21:30___2020-01-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-07_22:19:48___2020-11-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-11-14_22:15:34___2020-11-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-09-14_21:31:19___2020-09-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-05-19_21:22:04___2020-05-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-09-01_21:59:59___2020-09-01_21:24:54.csv'),
 PosixPath('../data_test/years/2020-10-01_21:59:59___2020-10-01_21:20:26.csv'),
 PosixPath('../data_test/years/2020-12-19_22:29:40___2020-12-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-03-01_22:59:59___2020-03-01_22:11:57.csv'),
 PosixPath('../data_test/years/2020-04-01_21:59:59___2020-04-01_21:18:25.csv'),
 PosixPath('../data_test/years/2020-12-14_22:21:47___2020-12-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-02-14_22:23:09___2020-02-14_22:59:59.csv'),
 PosixPath('../data_test/years/2020-02-19_22:14:28___2020-02-19_22:59:59.csv'),
 PosixPath('../data_test/years/2020-06-19_21:18:29___2020-06-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-08-19_21:22:40___2020-08-19_21:59:59.csv')]
def get_fpaths_subreddit(subreddit: str, dir: str = '../data_test/subreddits') -> list:
    """Return paths of all CSV files in `dir` whose name starts with `subreddit`.

    Args:
        subreddit: subreddit-name prefix to match, e.g. 'conspiracy'.
        dir: directory to search (non-recursive). Name kept for
            backward compatibility with keyword callers, although it
            shadows the `dir` builtin.

    Returns:
        List of `pathlib.Path` objects; empty if nothing matches or the
        directory does not exist.
    """
    return list(Path(dir).glob(f'{subreddit}*.csv'))
get_fpaths_subreddit (subreddit:str, dir:str='../data_test/subreddits')
get_fpaths_subreddit('conspiracy')

[PosixPath('../data_test/subreddits/conspiracy___2020-11-17_11:02:26___2020-11-27_22:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-03-01_23:00:02___2020-03-09_22:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-01-03_19:39:57___2020-01-27_22:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-04-17_04:25:29___2020-04-27_21:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-05-14_00:35:50___2020-05-27_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-06_03:54:59___2020-02-27_22:59:57.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-07-01_22:00:04___2020-07-09_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-11-01_23:00:04___2020-11-09_22:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-01_23:00:04___2020-02-09_22:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-08-01_22:00:01___2020-08-09_21:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-10-16_22:19:53___2020-10-27_22:59:46.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-09-01_22:00:03___2020-09-09_21:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-03-17_15:00:57___2020-03-27_22:59:52.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-05-01_22:00:03___2020-05-09_21:59:48.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-06-15_13:59:56___2020-06-27_21:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-07-18_14:50:04___2020-07-27_21:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-12-15_22:12:50___2020-12-27_22:59:52.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-04-01_22:00:01___2020-04-09_21:59:49.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-09-13_14:27:06___2020-09-27_21:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-12-01_23:00:02___2020-12-09_22:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-06-01_22:00:04___2020-06-09_21:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-01-01_23:00:03___2020-01-09_22:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-10-01_22:00:10___2020-10-09_21:59:57.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-08-17_21:01:44___2020-08-27_21:59:57.csv')]
read comments
read a single csv file of comments
fpath = get_fpaths_year('2019')[0]

def read_one_comments_csv(fpath: str) -> pd.DataFrame:
    """Read a single CSV file of Reddit comments and do basic cleaning.

    Args:
        fpath: path to the CSV file.

    Returns:
        DataFrame with columns id, created_utc, author, subreddit, body —
        rows with any NA dropped and duplicate ids removed — or None when
        the file is missing or empty.
    """
    # Keep the try body minimal: only read_csv raises the handled errors.
    try:
        comments = pd.read_csv(
            fpath,
            usecols=['id', 'created_utc', 'author', 'subreddit', 'body'],
            dtype={
                'id': 'string',
                'author': 'string',
                'subreddit': 'string',
                'body': 'string'
            },
            parse_dates=['created_utc'],
            low_memory=False,
            # comment bodies may contain embedded '\r'; split rows on '\n' only
            lineterminator='\n'
        )
    except FileNotFoundError:
        print(f'{fpath} not found on disk')
        return None
    except pd.errors.EmptyDataError:
        print(f'{fpath} is empty')
        return None
    return comments.dropna().drop_duplicates(subset='id')
read_one_comments_csv (fpath:str)
read_one_comments_csv(fpath)

| author | body | created_utc | id | subreddit | |
|---|---|---|---|---|---|
| 0 | lilfooty | This'll hurt them more than the loss | 2019-05-07 21:55:57 | emrz5jp | soccer | 
| 1 | Kaeneko | I loved vampires *so* much, lol. Always fantas... | 2019-05-07 21:34:12 | emrx5eq | BDSMcommunity | 
| 2 | Les_Deplorables | Poor Zombies gonna starve. No Brains! | 2019-05-07 21:21:11 | emrvxjq | The_Donald | 
| 3 | viper2544 | No one is going to mention the $12 shipping? | 2019-05-07 21:56:45 | emrz8g7 | legostarwars | 
| 4 | ninjasquirrelarmy | Agreed. I showed my stylist the Phoenix hair ... | 2019-05-07 21:34:43 | emrx730 | Instagramreality | 
| ... | ... | ... | ... | ... | ... | 
| 95 | kleptominotaur | Is tj still a muscle pharm dude? or parm? what... | 2019-05-07 21:35:05 | emrx88m | MMA | 
| 96 | bonesstackedonfloor | Fidgeting | 2019-05-07 21:45:27 | emry5y5 | AskReddit | 
| 97 | Perfectoi | Imagine thinking EV will be sacked. Friendly r... | 2019-05-07 21:18:06 | emrvmx0 | Barca | 
| 98 | BB-Zwei | And Dumbo. | 2019-05-07 21:51:35 | emryq8t | movies | 
| 99 | Thomas2PP | How did he got it??? | 2019-05-07 21:15:42 | emrveqf | hearthstone | 
100 rows × 5 columns
read multiple csv files with comments
def read_multi_comments_csvs(fpaths: list) -> pd.DataFrame:
    """Read and concatenate multiple CSV files of Reddit comments.

    Args:
        fpaths: list of CSV file paths.

    Returns:
        Single DataFrame with all files' cleaned comments, index reset.

    Raises:
        ValueError: if no file could be read (nothing to concatenate).
    """
    frames = [read_one_comments_csv(fpath) for fpath in fpaths]
    # read_one_comments_csv returns None for missing/empty files; drop
    # those explicitly instead of relying on pd.concat to skip them.
    frames = [frame for frame in frames if frame is not None]
    return pd.concat(frames, axis=0, ignore_index=True)
read_multi_comments_csvs (fpaths:list)
fpaths = get_fpaths_year('2019')
read_multi_comments_csvs(fpaths)

| author | body | created_utc | id | subreddit | |
|---|---|---|---|---|---|
| 0 | lilfooty | This'll hurt them more than the loss | 2019-05-07 21:55:57 | emrz5jp | soccer | 
| 1 | Kaeneko | I loved vampires *so* much, lol. Always fantas... | 2019-05-07 21:34:12 | emrx5eq | BDSMcommunity | 
| 2 | Les_Deplorables | Poor Zombies gonna starve. No Brains! | 2019-05-07 21:21:11 | emrvxjq | The_Donald | 
| 3 | viper2544 | No one is going to mention the $12 shipping? | 2019-05-07 21:56:45 | emrz8g7 | legostarwars | 
| 4 | ninjasquirrelarmy | Agreed. I showed my stylist the Phoenix hair ... | 2019-05-07 21:34:43 | emrx730 | Instagramreality | 
| ... | ... | ... | ... | ... | ... | 
| 4794 | m00sedad | Donald Fucking Trump | 2019-06-19 21:12:28 | erl5gls | AskReddit | 
| 4795 | Abramabundiz | obviously the office or parks, or maybe a spin... | 2019-06-19 21:35:15 | erl7gic | AskReddit | 
| 4796 | StarrySkye3 | That sounds like someone who argues that other... | 2019-06-19 21:33:57 | erl7ccj | otherkin | 
| 4797 | mostoriginalusername | I hadn't heard about that one. :) | 2019-06-19 21:41:22 | erl7zzj | catsareliquid | 
| 4798 | ggkiyo | I won't lie, I was someone who said that dota ... | 2019-06-19 21:54:45 | erl96fb | Games | 
4799 rows × 5 columns