Mirror of https://github.com/Watchful1/PushshiftDumps.git, synced 2025-07-23 06:40:47 -04:00
Initial implementation of process_month.py
parent b54a2483dc
commit fa5f6316fb
10 changed files with 703 additions and 369 deletions
@@ -7,7 +7,7 @@ from datetime import datetime
 import logging.handlers
 
 # put the path to the input file, or a folder of files to process all of
-input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments.zst"
+input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
 # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
 output_file = r"\\MYCLOUDPR4100\Public\output"
 # the format to output in, pick from the following options
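The comment above says the input can be a single .zst file or a folder of them, with the output then treated as a folder as well. A minimal sketch of that resolution step, assuming the usual layout; build_work_list and output_format are illustrative names, not the script's own:

import os

input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
output_file = r"\\MYCLOUDPR4100\Public\output"
output_format = "csv"  # extension that gets appended automatically

def build_work_list(input_file, output_file, output_format):
    # a folder input means: process every .zst inside it, mirroring names into the output folder
    work = []
    if os.path.isdir(input_file):
        for name in sorted(os.listdir(input_file)):
            if name.endswith(".zst"):
                base = name[:-len(".zst")]
                work.append((os.path.join(input_file, name),
                             os.path.join(output_file, f"{base}.{output_format}")))
    else:
        # a single file input means: one output file, extension appended
        work.append((input_file, f"{output_file}.{output_format}"))
    return work

for in_path, out_path in build_work_list(input_file, output_file, output_format):
    print(in_path, "->", out_path)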
@@ -29,7 +29,7 @@ write_bad_lines = True
 
 # only output items between these two dates
 from_date = datetime.strptime("2005-01-01", "%Y-%m-%d")
-to_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
+to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
 
 # the field to filter on, the values to filter with and whether it should be an exact match
 # some examples:
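The from_date/to_date pair above bounds which items get written out. A small sketch, assuming the usual dump layout where each line is a JSON object with a unix created_utc timestamp, of how such a range check is typically applied; in_date_range is an illustrative helper, not the script's code:

from datetime import datetime

from_date = datetime.strptime("2005-01-01", "%Y-%m-%d")
to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")

def in_date_range(obj, from_date, to_date):
    # created_utc is a unix timestamp, sometimes serialized as a string
    created = datetime.utcfromtimestamp(int(obj["created_utc"]))
    return from_date <= created <= to_date

# a dict shaped like one line of a dump file
print(in_date_range({"created_utc": "1672531200"}, from_date, to_date))  # 2023-01-01 -> True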
@@ -75,6 +75,7 @@ to_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
 # run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
 # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
 
 # change this to field = None if you don't want to filter by anything
 field = "body"
 values = ['']
 # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
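This block configures filtering on a single field against a list of values, optionally loaded from a file and optionally matched exactly. A rough sketch under those assumptions; the names values_file, exact_match and matches are illustrative, not taken from the script:

field = "body"
values = ["example phrase"]
values_file = None   # e.g. a text file with one value per line; overrides the list above
exact_match = False

if values_file is not None:
    with open(values_file, "r", encoding="utf-8") as handle:
        values = [line.strip().lower() for line in handle if line.strip()]
else:
    values = [value.lower() for value in values]

def matches(obj):
    if field is None:
        return True  # field = None disables filtering entirely
    field_value = str(obj.get(field, "")).lower()
    if exact_match:
        return field_value in values
    return any(value in field_value for value in values)

print(matches({"body": "An example phrase inside a comment"}))  # True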
@@ -14,6 +14,13 @@ import json
 # the script will look for both comments and submissions files for each subreddit
 folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
 subreddits = [
 	"aquarium",
 	"opiates",
 	"axolotls",
 	"piercing",
 	"titanfolk",
 	"AskOuija",
 	"piercing",
 	"DPH",
 	"dxm",
 ]
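Per the comment above, the script looks for both a comments and a submissions dump for each listed subreddit. A short sketch of what that lookup typically amounts to; find_dump_files is an illustrative helper, not the script's own code:

import os

folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
subreddits = ["aquarium", "opiates", "axolotls"]

def find_dump_files(folder, subreddits):
    # each subreddit is expected to have <name>_comments.zst and <name>_submissions.zst
    found = []
    for subreddit in subreddits:
        for kind in ("comments", "submissions"):
            path = os.path.join(folder, f"{subreddit}_{kind}.zst")
            if os.path.exists(path):
                found.append(path)
            else:
                print(f"missing: {path}")
    return found

print(find_dump_files(folder, subreddits))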
@@ -122,4 +122,10 @@ same_subreddit_bot
 SuicideAwarenessBot
 thebenshapirobot
 these_days_bot
 totes_meta_bot
 totes_meta_bot
 aardBot
 gifv-bot
 I_Love_You-BOT
 imdad_bot
 metric_units
 YoUaReSoHiLaRiOuS
@@ -17,9 +17,9 @@ import logging.handlers
 
 # put the path to the input file
-input_file_path = r"\\MYCLOUDPR4100\Public\reddit\subreddits\intel_comments.zst"
+input_file_path = r"\\MYCLOUDPR4100\Public\tools\PushshiftDumps\Straight-Wrap-172_submissions.zst"
 # put the path to the output file, with the csv extension
-output_file_path = r"\\MYCLOUDPR4100\Public\intel_comments.csv"
+output_file_path = r"\\MYCLOUDPR4100\Public\Straight-Wrap-172_submissions.csv"
 # if you want a custom set of fields, put them in the following list. If you leave it empty the script will use a default set of fields
 fields = []
 
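The config above points at a zstandard-compressed ndjson dump and an output CSV, with an optional custom field list. A compact sketch of that flow using the zstandard package; read_lines_zst and the fallback field set are illustrative, not the repository's exact implementation:

import csv
import json
import zstandard

input_file_path = r"\\MYCLOUDPR4100\Public\tools\PushshiftDumps\Straight-Wrap-172_submissions.zst"
output_file_path = r"\\MYCLOUDPR4100\Public\Straight-Wrap-172_submissions.csv"
fields = []  # empty -> fall back to a default set below

def read_lines_zst(path):
    # stream-decompress the .zst file and yield one ndjson line at a time
    with open(path, "rb") as handle:
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(handle)
        buffer = ""
        while True:
            chunk = reader.read(2**27).decode("utf-8", errors="ignore")
            if not chunk:
                break
            lines = (buffer + chunk).split("\n")
            buffer = lines[-1]
            for line in lines[:-1]:
                if line:
                    yield line

# illustrative fallback only; the script's own default field set may differ
output_fields = fields or ["author", "created_utc", "id", "score", "title", "selftext"]

with open(output_file_path, "w", newline="", encoding="utf-8") as out_handle:
    writer = csv.writer(out_handle)
    writer.writerow(output_fields)
    for line in read_lines_zst(input_file_path):
        obj = json.loads(line)
        writer.writerow([obj.get(name, "") for name in output_fields])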