mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 23:55:18 -04:00
Fix comments
This commit is contained in:
parent
dd12687141
commit
021d033732
1 changed file with 5 additions and 9 deletions
|
@@ -1,12 +1,7 @@
|
||||||
# this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line
|
# this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line
|
||||||
# and passes it into the save_obj function, if that function returns true for a line, it's written out into a
|
# and if it matches the criteria in the command line arguments, it's written out into a separate file for
|
||||||
# separate file for that month. After all the ndjson files are processed, it iterates through the resulting
|
# that month. After all the ndjson files are processed, it iterates through the resulting files and combines
|
||||||
# files and combines them into a final file.
|
# them into a final file.
|
||||||
|
|
||||||
# once complete, the combined file can easily be processed like
|
|
||||||
# with open(file_path, 'r') as file:
|
|
||||||
# for line in file:
|
|
||||||
# obj = json.loads(line)
|
|
||||||
|
|
||||||
# features:
|
# features:
|
||||||
# - multiple processes in parallel to maximize drive read and decompression
|
# - multiple processes in parallel to maximize drive read and decompression
|
||||||
|
@@ -133,7 +128,8 @@ def read_lines_zst(file_name):
|
||||||
|
|
||||||
|
|
||||||
# base of each separate process. Loads a file, iterates through lines and writes out
|
# base of each separate process. Loads a file, iterates through lines and writes out
|
||||||
# the ones where save_obj() returns true. Also passes status information back to the parent via a queue
|
# the ones where the `field` of the object matches `value`. Also passes status
|
||||||
|
# information back to the parent via a queue
|
||||||
def process_file(file, working_folder, queue, field, value):
|
def process_file(file, working_folder, queue, field, value):
|
||||||
output_file = None
|
output_file = None
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue