Mirror of https://github.com/Watchful1/PushshiftDumps.git, synced 2025-07-23 06:40:47 -04:00
Clean up

commit dd12687141 (parent bd7378ff91)
4 changed files with 40 additions and 29 deletions
README.md

@@ -1 +1,5 @@
-coming soon
+This repo contains example python scripts for processing the reddit dump files created by pushshift. The files can be downloaded from [here](https://files.pushshift.io/reddit/) or torrented from [here](https://academictorrents.com/details/f37bb9c0abe350f0f1cbd4577d0fe413ed07724e).
+
+* `single_file.py` decompresses and iterates over a single zst compressed file
+* `iterate_folder.py` does the same, but for all files in a folder
+* `combine_folder_multiprocess.py` uses separate processes to iterate over multiple files in parallel, writing lines that match the criteria passed in to text files, then combining them into a final zst compressed file
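For orientation, the snippet below sketches the decompress-and-iterate pattern these scripts are built around. It is a minimal sketch, not code from this commit: the input file name is a placeholder, and the enlarged `max_window_size` is passed because the larger dump files are compressed with a long window.

```python
# minimal sketch: stream a zst-compressed ndjson dump and parse one line at a time
# (the file name is a placeholder; this code is not part of the commit below)
import io
import json
import zstandard


def read_lines_zst(file_name):
	# stream-decompress the file and yield decoded lines without loading it all into memory
	with open(file_name, 'rb') as file_handle:
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		for line in io.TextIOWrapper(reader, encoding='utf-8'):
			yield line


if __name__ == "__main__":
	line_count = 0
	for line in read_lines_zst("RS_2013-03.zst"):  # placeholder dump file
		obj = json.loads(line)
		obj["created_utc"]  # touching a field confirms the line parsed as json
		line_count += 1
	print(f"{line_count:,} lines")
```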
combine_folder_multiprocess.py

@@ -1,3 +1,24 @@
+# this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line
+# and passes it into the save_obj function; if the function returns true for a line, it's written out into a
+# separate file for that month. After all the ndjson files are processed, it iterates through the resulting
+# files and combines them into a final file.
+
+# once complete, the combined file can easily be processed like
+# with open(file_path, 'r') as file:
+# 	for line in file:
+# 		obj = json.loads(line)
+
+# features:
+# - multiple processes in parallel to maximize drive read and decompression
+# - saves state as it completes each file and picks up where it stopped
+# - detailed progress indicators
+
+# examples:
+# - get all comments that have a subreddit field (subreddit is the default) of "wallstreetbets"
+# 	python3 combine_folder_multiprocess.py reddit/comments reddit_final --name wallstreetbets_comments --value wallstreetbets
+# - get all comments that have an author field of Watchful1
+# 	python3 combine_folder_multiprocess.py reddit/comments reddit_final --name watchful1_comments --field author --value Watchful1
+
 import zstandard
 import os
 import json
@@ -25,22 +46,6 @@ log_file_handler.setFormatter(log_formatter)
 log.addHandler(log_file_handler)
 
 
-# this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line
-# and passes it into the save_obj function, if it function returns true for a line, it's written out into a
-# separate file for that month. After all the ndjson files are processed, it iterates through the resulting
-# files and combines them into a final file.
-
-# once complete, the combined file can easily be processed like
-# with open(file_path, 'r') as file:
-# 	for line in file:
-# 		obj = json.loads(line)
-
-# features:
-# - multiple processes in parallel to maximize drive read and decompression
-# - saves state as it completes each file and picks up where it stopped
-# - detailed progress indicators
-
-
 # convenience object used to pass status information between processes
 class FileConfig:
 	def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0):
@@ -179,7 +184,7 @@ if __name__ == '__main__':
 	log.setLevel(logging.DEBUG)
 
 	log.info(f"Loading files from: {args.input}")
-	log.info(f"Writing output to: {(os.path.join(args.output, args.name + '.txt'))}")
+	log.info(f"Writing output to: {(os.path.join(args.output, args.name + '.zst'))}")
 
 	multiprocessing.set_start_method('spawn')
 	queue = multiprocessing.Manager().Queue()
@@ -294,21 +299,17 @@ if __name__ == '__main__':
 	log.info(f"Processing complete, combining {len(working_file_paths)} result files")
 
 	output_lines = 0
-	output_file_path = os.path.join(args.output, args.name + ".txt")
+	output_file_path = os.path.join(args.output, args.name + ".zst")
 	# combine all the output files into the final results file
 	with open(output_file_path, 'w') as output_file:
-		i = 0
+		files_combined = 0
+		writer = zstandard.ZstdCompressor().stream_writer(output_file)
 		for working_file_path in working_file_paths:
-			i += 1
-			log.info(f"Reading {i}/{len(working_file_paths)}")
+			files_combined += 1
+			log.info(f"Reading {files_combined}/{len(working_file_paths)}")
 			with open(working_file_path, 'r') as input_file:
 				for line in input_file.readlines():
 					output_lines += 1
-					output_file.write(line)
+					writer.write(line.encode('utf-8'))
 
 	log.info(f"Finished combining files, {output_lines:,} lines written to {output_file_path}")
-
-	# test file sorting
-	# compress results
-	# example command line call in comment
-
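Because this change makes the combined result a zstandard stream rather than plain text, the `open(file_path, 'r')` snippet in the header comment applies only after decompression. Below is a minimal sketch of reading the compressed result directly, mirroring the read pattern sketched after the README above; the output path simply follows the wallstreetbets example command and is illustrative.

```python
# minimal sketch: read the combined .zst output back line by line
# (the path just follows the wallstreetbets example command above; adjust as needed)
import io
import json
import zstandard

output_file_path = "reddit_final/wallstreetbets_comments.zst"

with open(output_file_path, 'rb') as file_handle:
	reader = zstandard.ZstdDecompressor().stream_reader(file_handle)
	for line in io.TextIOWrapper(reader, encoding='utf-8'):
		obj = json.loads(line)
		# each obj is one matched comment; filter or aggregate it here
```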
iterate_folder.py

@@ -1,3 +1,7 @@
+# this is an example of iterating over all zst files in a single folder,
+# decompressing them and reading the created_utc field to make sure the files
+# are intact. It has no output other than the number of lines
+
 import zstandard
 import os
 import json
single_file.py

@@ -1,3 +1,5 @@
+# this is an example of loading and iterating over a single file
+
 import zstandard
 import os
 import json
@@ -29,7 +31,7 @@ def read_lines_zst(file_name):
 
 
 if __name__ == "__main__":
-	file_path = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2013-03.zst"
+	file_path = sys.argv[1]
 	file_size = os.stat(file_path).st_size
 	file_lines = 0
 	file_bytes_processed = 0