diff --git a/personal/diagnostic/test_file.py b/personal/diagnostic/test_file.py
index 923ce9e..abb178c 100644
--- a/personal/diagnostic/test_file.py
+++ b/personal/diagnostic/test_file.py
@@ -8,7 +8,7 @@ log = discord_logging.init_logging()
 
 
 if __name__ == "__main__":
-	input_path = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2023-09.zst"
+	input_path = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2023-04.zst"
 
 	input_file_paths = []
 	if os.path.isdir(input_path):
diff --git a/scripts/filter_file.py b/scripts/filter_file.py
index e1ee4e4..f03d215 100644
--- a/scripts/filter_file.py
+++ b/scripts/filter_file.py
@@ -7,7 +7,7 @@ from datetime import datetime
 import logging.handlers
 
 # put the path to the input file, or a folder of files to process all of
-input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits/CryptoCurrency_submissions.zst"
+input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits\CryptoCurrency_submissions.zst"
 # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
 output_file = r"\\MYCLOUDPR4100\Public\output"
 # the format to output in, pick from the following options
diff --git a/scripts/find_overlapping_users.py b/scripts/find_overlapping_users.py
index 5e472da..9dde855 100644
--- a/scripts/find_overlapping_users.py
+++ b/scripts/find_overlapping_users.py
@@ -7,8 +7,8 @@ import zstandard
 import json
 
 input_files = [
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits\srilanka_comments.zst",
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits\Warthunder_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Chattanooga_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Graffiti_comments.zst",
 ]
 ignored_users = ['[deleted]', 'automoderator']
 min_comments_per_sub = 1
diff --git a/scripts/to_csv.py b/scripts/to_csv.py
index e7431ec..f4d561a 100644
--- a/scripts/to_csv.py
+++ b/scripts/to_csv.py
@@ -16,6 +16,13 @@ from datetime import datetime
 import logging.handlers
 
+# put the path to the input file
+input_file_path = r"\\MYCLOUDPR4100\Public\reddit\subreddits\CryptoCurrency_submissions.zst"
+# put the path to the output file, with the csv extension
+output_file_path = r"\\MYCLOUDPR4100\Public\CryptoCurrency_submissions.csv"
+# if you want a custom set of fields, put them in the following list. If you leave it empty the script will use a default set of fields
+fields = []
+
 
 log = logging.getLogger("bot")
 log.setLevel(logging.DEBUG)
 log.addHandler(logging.StreamHandler())
@@ -52,16 +59,24 @@ def read_lines_zst(file_name):
 
 
 if __name__ == "__main__":
-	input_file_path = sys.argv[1]
-	output_file_path = sys.argv[2]
-	fields = sys.argv[3].split(",")
+	# command line arguments, if given, override the settings at the top of the file
+	if len(sys.argv) >= 3:
+		input_file_path = sys.argv[1]
+		output_file_path = sys.argv[2]
+	# the field list is an optional third argument, reading it unconditionally raises IndexError when only the two paths are passed
+	if len(sys.argv) >= 4:
+		fields = sys.argv[3].split(",")
+
+	is_submission = "submission" in input_file_path
+	if not fields:
+		if is_submission:
+			fields = ["author","title","score","created","link","text","url"]
+		else:
+			fields = ["author","score","created","link","body"]
 
 	file_size = os.stat(input_file_path).st_size
-	file_lines = 0
-	file_bytes_processed = 0
-	line = None
-	created = None
-	bad_lines = 0
+	file_lines, bad_lines = 0, 0
+	line, created = None, None
 	output_file = open(output_file_path, "w", encoding='utf-8', newline="")
 	writer = csv.writer(output_file)
 	writer.writerow(fields)
@@ -71,7 +86,22 @@ if __name__ == "__main__":
 				obj = json.loads(line)
 				output_obj = []
 				for field in fields:
-					output_obj.append(str(obj[field]).encode("utf-8", errors='replace').decode())
+					if field == "created":
+						# created_utc is a UTC epoch, so format it as UTC like the progress timestamp below instead of the machine's local time
+						value = datetime.utcfromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d %H:%M")
+					elif field == "link":
+						value = f"https://www.reddit.com{obj['permalink']}"
+					elif field == "author":
+						value = f"u/{obj['author']}"
+					elif field == "text":
+						if 'selftext' in obj:
+							value = obj['selftext']
+						else:
+							value = ""
+					else:
+						value = obj[field]
+
+					output_obj.append(str(value).encode("utf-8", errors='replace').decode())
 				writer.writerow(output_obj)
 
 				created = datetime.utcfromtimestamp(int(obj['created_utc']))