mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 15:45:19 -04:00
Didn't mean to commit that
This commit is contained in:
parent
ef186b7bd7
commit
ec977a76b2
2 changed files with 14 additions and 6 deletions
|
@ -7,9 +7,9 @@ from datetime import datetime
|
||||||
import logging.handlers
|
import logging.handlers
|
||||||
|
|
||||||
# put the path to the input file, or a folder of files to process all of
|
# put the path to the input file, or a folder of files to process all of
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_23.zst"
|
input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments.zst"
|
||||||
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
||||||
output_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_hero"
|
output_file = r"\\MYCLOUDPR4100\Public\output"
|
||||||
# the format to output in, pick from the following options
|
# the format to output in, pick from the following options
|
||||||
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
||||||
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
||||||
|
@ -79,7 +79,7 @@ field = "body"
|
||||||
values = ['']
|
values = ['']
|
||||||
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
||||||
# if this list is very large, it could greatly slow down the process
|
# if this list is very large, it could greatly slow down the process
|
||||||
values_file = r"\\MYCLOUDPR4100\Public\askreddit_submissions_ids.txt"
|
values_file = None
|
||||||
exact_match = False
|
exact_match = False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -7,10 +7,13 @@ import zstandard
|
||||||
import json
|
import json
|
||||||
|
|
||||||
input_files = [
|
input_files = [
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\baseballcards_comments.zst",
|
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst",
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\classicwow_comments.zst",
|
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst",
|
||||||
|
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst",
|
||||||
|
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst",
|
||||||
]
|
]
|
||||||
ignored_users = ['[deleted]', 'automoderator']
|
ignored_users = {'[deleted]', 'automoderator'}
|
||||||
|
ignored_users_file = "ignored.txt"
|
||||||
min_comments_per_sub = 1
|
min_comments_per_sub = 1
|
||||||
file_name = "users.txt"
|
file_name = "users.txt"
|
||||||
require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs
|
require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs
|
||||||
|
@ -64,6 +67,11 @@ def read_lines_zst(file_name):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
if os.path.exists(ignored_users_file):
|
||||||
|
with open(ignored_users_file) as fh:
|
||||||
|
for user in fh.readlines():
|
||||||
|
ignored_users.add(user.strip().lower())
|
||||||
|
|
||||||
commenterSubreddits = defaultdict(int)
|
commenterSubreddits = defaultdict(int)
|
||||||
is_first = True
|
is_first = True
|
||||||
total_lines = 0
|
total_lines = 0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue