# read data

## create test data

This repository contains only a sample of the original data because of size constraints. The code below creates that sample from the full data set.

```python
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

src_dir = Path('../../data')
dest_dir = Path('../data_test')
n = 100  # rows to sample per csv file

# mirror the directory tree of src_dir under dest_dir,
# writing a fixed-size random sample of every csv file
for subdir in src_dir.rglob('*'):
    if subdir.is_dir():
        dest_subdir = dest_dir / subdir.relative_to(src_dir)
        dest_subdir.mkdir(parents=True, exist_ok=True)
        for file_path in tqdm(list(subdir.glob('*.csv'))):
            df = pd.read_csv(file_path, on_bad_lines='skip', engine='python')
            df_sample = df.sample(n=min(n, len(df)), random_state=58)
            sample_file_path = dest_subdir / file_path.name
            df_sample.to_csv(sample_file_path, index=False)
```
## read data
### get file paths
```python
def get_fpaths_year(year: str,
                    dir: str = '../data_test/years') -> list:
    # collect all csv files whose names start with the given year
    dir_path = Path(dir)
    return list(dir_path.glob(f'{year}*.csv'))
```
```python
get_fpaths_year('2020')
```

```
[PosixPath('../data_test/years/2020-04-14_21:20:57___2020-04-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-04-07_21:19:06___2020-04-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-06-14_21:19:36___2020-06-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-10-14_21:19:48___2020-10-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-06-07_21:19:08___2020-06-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-02-07_22:18:35___2020-02-07_22:59:59.csv'),
PosixPath('../data_test/years/2020-07-14_21:22:47___2020-07-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-06-01_21:59:59___2020-06-01_21:22:24.csv'),
PosixPath('../data_test/years/2020-09-19_21:14:30___2020-09-19_21:59:59.csv'),
PosixPath('../data_test/years/2020-07-01_21:59:59___2020-07-01_21:23:38.csv'),
PosixPath('../data_test/years/2020-10-19_14:18:54___2020-10-19_14:58:31.csv'),
PosixPath('../data_test/years/2020-02-01_22:59:59___2020-02-01_22:07:59.csv'),
PosixPath('../data_test/years/2020-08-14_21:23:04___2020-08-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-03-19_22:14:44___2020-03-19_22:59:59.csv'),
PosixPath('../data_test/years/2020-11-01_22:59:59___2020-11-01_22:20:51.csv'),
PosixPath('../data_test/years/2020-01-01_22:59:59___2020-01-01_22:13:27.csv'),
PosixPath('../data_test/years/2020-07-19_21:19:00___2020-07-19_21:59:59.csv'),
PosixPath('../data_test/years/2020-10-07_21:20:42___2020-10-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-08-01_21:59:59___2020-08-01_21:17:27.csv'),
PosixPath('../data_test/years/2020-03-07_22:37:38___2020-03-07_22:59:59.csv'),
PosixPath('../data_test/years/2020-05-14_21:22:43___2020-05-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-12-07_22:21:23___2020-12-07_22:59:59.csv'),
PosixPath('../data_test/years/2020-03-14_22:09:26___2020-03-14_22:59:59.csv'),
PosixPath('../data_test/years/2020-05-01_21:59:59___2020-05-01_21:19:50.csv'),
PosixPath('../data_test/years/2020-01-19_22:39:11___2020-01-19_22:59:59.csv'),
PosixPath('../data_test/years/2020-08-07_21:22:06___2020-08-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-09-07_21:19:15___2020-09-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-07-07_21:22:39___2020-07-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-04-19_21:17:49___2020-04-19_21:59:59.csv'),
PosixPath('../data_test/years/2020-01-14_22:13:57___2020-01-14_22:59:59.csv'),
PosixPath('../data_test/years/2020-11-19_22:27:49___2020-11-19_22:59:59.csv'),
PosixPath('../data_test/years/2020-12-01_22:59:59___2020-12-01_22:22:27.csv'),
PosixPath('../data_test/years/2020-05-07_21:22:03___2020-05-07_21:59:59.csv'),
PosixPath('../data_test/years/2020-01-07_22:21:30___2020-01-07_22:59:59.csv'),
PosixPath('../data_test/years/2020-11-07_22:19:48___2020-11-07_22:59:59.csv'),
PosixPath('../data_test/years/2020-11-14_22:15:34___2020-11-14_22:59:59.csv'),
PosixPath('../data_test/years/2020-09-14_21:31:19___2020-09-14_21:59:59.csv'),
PosixPath('../data_test/years/2020-05-19_21:22:04___2020-05-19_21:59:59.csv'),
PosixPath('../data_test/years/2020-09-01_21:59:59___2020-09-01_21:24:54.csv'),
PosixPath('../data_test/years/2020-10-01_21:59:59___2020-10-01_21:20:26.csv'),
PosixPath('../data_test/years/2020-12-19_22:29:40___2020-12-19_22:59:59.csv'),
PosixPath('../data_test/years/2020-03-01_22:59:59___2020-03-01_22:11:57.csv'),
PosixPath('../data_test/years/2020-04-01_21:59:59___2020-04-01_21:18:25.csv'),
PosixPath('../data_test/years/2020-12-14_22:21:47___2020-12-14_22:59:59.csv'),
PosixPath('../data_test/years/2020-02-14_22:23:09___2020-02-14_22:59:59.csv'),
PosixPath('../data_test/years/2020-02-19_22:14:28___2020-02-19_22:59:59.csv'),
PosixPath('../data_test/years/2020-06-19_21:18:29___2020-06-19_21:59:59.csv'),
PosixPath('../data_test/years/2020-08-19_21:22:40___2020-08-19_21:59:59.csv')]
```
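The file names encode the time range a file covers as `<start>___<end>`, with timestamps formatted as `YYYY-MM-DD_HH:MM:SS`. As an illustration (the helper below is hypothetical, not part of the notebook), the range can be recovered from a path:

```python
from datetime import datetime

def parse_time_range(fpath: Path) -> tuple:
    # hypothetical helper: split the file stem into its start and end timestamps
    start_str, end_str = fpath.stem.split('___')
    fmt = '%Y-%m-%d_%H:%M:%S'
    return datetime.strptime(start_str, fmt), datetime.strptime(end_str, fmt)
```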
```python
def get_fpaths_subreddit(subreddit: str, dir: str='../data_test/subreddits') -> list:
    # collect all csv files whose names start with the given subreddit
    dir_path = Path(dir)
    return list(dir_path.glob(f'{subreddit}*.csv'))
```
```python
get_fpaths_subreddit('conspiracy')
```

```
[PosixPath('../data_test/subreddits/conspiracy___2020-11-17_11:02:26___2020-11-27_22:59:54.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-03-01_23:00:02___2020-03-09_22:59:59.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-01-03_19:39:57___2020-01-27_22:59:58.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-04-17_04:25:29___2020-04-27_21:59:55.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-05-14_00:35:50___2020-05-27_21:59:58.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-02-06_03:54:59___2020-02-27_22:59:57.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-07-01_22:00:04___2020-07-09_21:59:58.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-11-01_23:00:04___2020-11-09_22:59:56.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-02-01_23:00:04___2020-02-09_22:59:55.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-08-01_22:00:01___2020-08-09_21:59:56.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-10-16_22:19:53___2020-10-27_22:59:46.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-09-01_22:00:03___2020-09-09_21:59:56.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-03-17_15:00:57___2020-03-27_22:59:52.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-05-01_22:00:03___2020-05-09_21:59:48.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-06-15_13:59:56___2020-06-27_21:59:55.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-07-18_14:50:04___2020-07-27_21:59:59.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-12-15_22:12:50___2020-12-27_22:59:52.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-04-01_22:00:01___2020-04-09_21:59:49.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-09-13_14:27:06___2020-09-27_21:59:59.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-12-01_23:00:02___2020-12-09_22:59:59.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-06-01_22:00:04___2020-06-09_21:59:54.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-01-01_23:00:03___2020-01-09_22:59:54.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-10-01_22:00:10___2020-10-09_21:59:57.csv'),
PosixPath('../data_test/subreddits/conspiracy___2020-08-17_21:01:44___2020-08-27_21:59:57.csv')]
```
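Note that `Path.glob` returns matches in arbitrary order. Where chronological order matters downstream, the paths can be sorted by the timestamps embedded in the names; a minimal sketch reusing the hypothetical `parse_time_range` helper from above:

```python
fpaths = get_fpaths_subreddit('conspiracy')
fpaths_sorted = sorted(fpaths, key=parse_time_range)  # sort by (start, end)
```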
### read comments

#### read a single csv file of comments
```python
fpath = get_fpaths_year('2019')[0]
```
```python
def read_one_comments_csv(fpath: str) -> pd.DataFrame:
    try:
        comments = pd.read_csv(
            fpath,
            usecols=['id', 'created_utc', 'author', 'subreddit', 'body'],
            dtype={
                'id': 'string',
                'author': 'string',
                'subreddit': 'string',
                'body': 'string'
            },
            parse_dates=['created_utc'],
            low_memory=False,
            lineterminator='\n'
        )
        # drop rows with missing values and duplicated comment ids
        comments_clean = comments \
            .dropna() \
            .drop_duplicates(subset='id')
        return comments_clean
    except FileNotFoundError:
        print(f'{fpath} not found on disk')
    except pd.errors.EmptyDataError:
        print(f'{fpath} is empty')
```
```python
read_one_comments_csv(fpath)
```
|  | author | body | created_utc | id | subreddit |
|---|---|---|---|---|---|
| 0 | lilfooty | This'll hurt them more than the loss | 2019-05-07 21:55:57 | emrz5jp | soccer |
| 1 | Kaeneko | I loved vampires *so* much, lol. Always fantas... | 2019-05-07 21:34:12 | emrx5eq | BDSMcommunity |
| 2 | Les_Deplorables | Poor Zombies gonna starve. No Brains! | 2019-05-07 21:21:11 | emrvxjq | The_Donald |
| 3 | viper2544 | No one is going to mention the $12 shipping? | 2019-05-07 21:56:45 | emrz8g7 | legostarwars |
| 4 | ninjasquirrelarmy | Agreed. I showed my stylist the Phoenix hair ... | 2019-05-07 21:34:43 | emrx730 | Instagramreality |
| ... | ... | ... | ... | ... | ... |
| 95 | kleptominotaur | Is tj still a muscle pharm dude? or parm? what... | 2019-05-07 21:35:05 | emrx88m | MMA |
| 96 | bonesstackedonfloor | Fidgeting | 2019-05-07 21:45:27 | emry5y5 | AskReddit |
| 97 | Perfectoi | Imagine thinking EV will be sacked. Friendly r... | 2019-05-07 21:18:06 | emrvmx0 | Barca |
| 98 | BB-Zwei | And Dumbo. | 2019-05-07 21:51:35 | emryq8t | movies |
| 99 | Thomas2PP | How did he got it??? | 2019-05-07 21:15:42 | emrveqf | hearthstone |
100 rows × 5 columns
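Note that on a missing or empty file the function prints a message and implicitly returns `None`, so callers need to handle that case. A hypothetical example:

```python
# the path below is deliberately nonexistent
missing = read_one_comments_csv('../data_test/years/does_not_exist.csv')
# prints: ../data_test/years/does_not_exist.csv not found on disk
print(missing)  # None
```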
#### read multiple csv files with comments
```python
def read_multi_comments_csvs(fpaths: list) -> pd.DataFrame:
    comments_lst = []
    for fpath in fpaths:
        comments = read_one_comments_csv(fpath)
        # skip files that could not be read (read_one_comments_csv returns None),
        # since pd.concat raises on None entries
        if comments is not None:
            comments_lst.append(comments)
    comments_concat = pd.concat(
        comments_lst,
        axis=0,
        ignore_index=True
    )
    return comments_concat
```
```python
fpaths = get_fpaths_year('2019')
read_multi_comments_csvs(fpaths)
```
|  | author | body | created_utc | id | subreddit |
|---|---|---|---|---|---|
| 0 | lilfooty | This'll hurt them more than the loss | 2019-05-07 21:55:57 | emrz5jp | soccer |
| 1 | Kaeneko | I loved vampires *so* much, lol. Always fantas... | 2019-05-07 21:34:12 | emrx5eq | BDSMcommunity |
| 2 | Les_Deplorables | Poor Zombies gonna starve. No Brains! | 2019-05-07 21:21:11 | emrvxjq | The_Donald |
| 3 | viper2544 | No one is going to mention the $12 shipping? | 2019-05-07 21:56:45 | emrz8g7 | legostarwars |
| 4 | ninjasquirrelarmy | Agreed. I showed my stylist the Phoenix hair ... | 2019-05-07 21:34:43 | emrx730 | Instagramreality |
| ... | ... | ... | ... | ... | ... |
| 4794 | m00sedad | Donald Fucking Trump | 2019-06-19 21:12:28 | erl5gls | AskReddit |
| 4795 | Abramabundiz | obviously the office or parks, or maybe a spin... | 2019-06-19 21:35:15 | erl7gic | AskReddit |
| 4796 | StarrySkye3 | That sounds like someone who argues that other... | 2019-06-19 21:33:57 | erl7ccj | otherkin |
| 4797 | mostoriginalusername | I hadn't heard about that one. :) | 2019-06-19 21:41:22 | erl7zzj | catsareliquid |
| 4798 | ggkiyo | I won't lie, I was someone who said that dota ... | 2019-06-19 21:54:45 | erl96fb | Games |
4799 rows × 5 columns
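The reader composes the same way with the subreddit paths; a minimal sketch, assuming the sampled test data is in place:

```python
# read all sampled files for a single subreddit into one frame
conspiracy = read_multi_comments_csvs(get_fpaths_subreddit('conspiracy'))
conspiracy['subreddit'].unique()  # expected to contain only 'conspiracy'
```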