Initial implementation of process_month.py

This commit is contained in:
Watchful1 2024-05-17 21:35:00 -07:00
parent b54a2483dc
commit fa5f6316fb
10 changed files with 703 additions and 369 deletions

View file

@ -7,7 +7,7 @@ from datetime import datetime
import logging.handlers
# put the path to the input file, or a folder of files to process all of them
input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments.zst"
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_file = r"\\MYCLOUDPR4100\Public\output"
# the format to output in, pick from the following options
@ -29,7 +29,7 @@ write_bad_lines = True
# only output items between these two dates
from_date = datetime.strptime("2005-01-01", "%Y-%m-%d")
to_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
# the field to filter on, the values to filter with and whether it should be an exact match
# some examples:
@ -75,6 +75,7 @@ to_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
# run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
# change this to field = None if you don't want to filter by anything
field = "body"
values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set, this overrides the value list above

View file

@ -14,6 +14,13 @@ import json
# the script will look for both comments and submissions files for each subreddit
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
subreddits = [
"aquarium",
"opiates",
"axolotls",
"piercing",
"titanfolk",
"AskOuija",
"piercing",
"DPH",
"dxm",
]

View file

@ -122,4 +122,10 @@ same_subreddit_bot
SuicideAwarenessBot
thebenshapirobot
these_days_bot
totes_meta_bot
totes_meta_bot
aardBot
gifv-bot
I_Love_You-BOT
imdad_bot
metric_units
YoUaReSoHiLaRiOuS

View file

@ -17,9 +17,9 @@ import logging.handlers
# put the path to the input file
input_file_path = r"\\MYCLOUDPR4100\Public\reddit\subreddits\intel_comments.zst"
input_file_path = r"\\MYCLOUDPR4100\Public\tools\PushshiftDumps\Straight-Wrap-172_submissions.zst"
# put the path to the output file, with the csv extension
output_file_path = r"\\MYCLOUDPR4100\Public\intel_comments.csv"
output_file_path = r"\\MYCLOUDPR4100\Public\Straight-Wrap-172_submissions.csv"
# if you want a custom set of fields, put them in the following list. If you leave it empty the script will use a default set of fields
fields = []