mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-27 00:25:26 -04:00
Fix path name
This commit is contained in:
parent
143a40fc23
commit
5b95fc7e7d
2 changed files with 23 additions and 11 deletions
|
@ -326,8 +326,8 @@ if __name__ == '__main__':
|
||||||
input_lines += 1
|
input_lines += 1
|
||||||
monthly_counts[line.strip()] += 1
|
monthly_counts[line.strip()] += 1
|
||||||
|
|
||||||
file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
|
file.count_file_path = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
|
||||||
with open(file.monthly_count_file, 'w') as output_handle:
|
with open(file.count_file_path, 'w') as output_handle:
|
||||||
for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True):
|
for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True):
|
||||||
output_handle.write(f"{field} {count}\n")
|
output_handle.write(f"{field} {count}\n")
|
||||||
|
|
||||||
|
@ -338,13 +338,13 @@ if __name__ == '__main__':
|
||||||
if stage == "agg":
|
if stage == "agg":
|
||||||
field_counts = defaultdict(int)
|
field_counts = defaultdict(int)
|
||||||
for file in input_files:
|
for file in input_files:
|
||||||
with open(file.monthly_count_file, 'r') as input_handle:
|
with open(file.count_file_path, 'r') as input_handle:
|
||||||
for line in input_handle:
|
for line in input_handle:
|
||||||
try:
|
try:
|
||||||
field, count = line.strip().split("\t")
|
field, count = line.strip().split("\t")
|
||||||
field_counts[field] = count
|
field_counts[field] = count
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
log.info(f"Line failed in file {file.monthly_count_file}: {line}")
|
log.info(f"Line failed in file {file.count_file_path}: {line}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True)
|
sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
|
|
@ -5,17 +5,18 @@ import sys
|
||||||
import csv
|
import csv
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import logging.handlers
|
import logging.handlers
|
||||||
|
import traceback
|
||||||
|
|
||||||
# put the path to the input file, or a folder of files to process all of
|
# put the path to the input file, or a folder of files to process all of
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
|
input_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments.zst"
|
||||||
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
||||||
output_file = r"\\MYCLOUDPR4100\Public\output"
|
output_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments"
|
||||||
# the format to output in, pick from the following options
|
# the format to output in, pick from the following options
|
||||||
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
||||||
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
||||||
# csv: a comma separated value file. Can be opened by a text editor or excel
|
# csv: a comma separated value file. Can be opened by a text editor or excel
|
||||||
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
|
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
|
||||||
output_format = "csv"
|
output_format = "zst"
|
||||||
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
|
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
|
||||||
# any field that's in the dump is supported, but useful ones are
|
# any field that's in the dump is supported, but useful ones are
|
||||||
# author: the username of the author
|
# author: the username of the author
|
||||||
|
@ -76,12 +77,15 @@ to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
|
||||||
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
||||||
|
|
||||||
# change this to field = None if you don't want to filter by anything
|
# change this to field = None if you don't want to filter by anything
|
||||||
field = "body"
|
field = "selftext"
|
||||||
values = ['']
|
values = ['']
|
||||||
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
||||||
# if this list is very large, it could greatly slow down the process
|
# if this list is very large, it could greatly slow down the process
|
||||||
values_file = None
|
values_file = None
|
||||||
|
# if true, only match the full value, if false match the value anywhere in the text. Partial matches are much slower
|
||||||
exact_match = False
|
exact_match = False
|
||||||
|
# if true, returns rows that do not match the condition
|
||||||
|
inverse = False
|
||||||
|
|
||||||
|
|
||||||
# sets up logging to the console as well as a file
|
# sets up logging to the console as well as a file
|
||||||
|
@ -209,7 +213,10 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if field is not None:
|
if field is not None:
|
||||||
field_value = obj[field].lower()
|
field_value = obj[field]
|
||||||
|
if field_value is None:
|
||||||
|
continue
|
||||||
|
field_value = field_value.lower()
|
||||||
matched = False
|
matched = False
|
||||||
for value in values:
|
for value in values:
|
||||||
if exact_match:
|
if exact_match:
|
||||||
|
@ -220,9 +227,14 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
|
||||||
if value in field_value:
|
if value in field_value:
|
||||||
matched = True
|
matched = True
|
||||||
break
|
break
|
||||||
|
if inverse:
|
||||||
|
if matched:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
if not matched:
|
if not matched:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
matched_lines += 1
|
matched_lines += 1
|
||||||
if output_format == "zst":
|
if output_format == "zst":
|
||||||
write_line_zst(handle, line)
|
write_line_zst(handle, line)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue