mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-23 14:50:35 -04:00
Add compression level
This commit is contained in:
parent
b92bab7f1a
commit
4ecf22aaee
2 changed files with 9 additions and 6 deletions
|
@ -131,15 +131,18 @@ if __name__ == "__main__":
|
||||||
parser.add_argument('--input', help='Input folder', required=True)
|
parser.add_argument('--input', help='Input folder', required=True)
|
||||||
parser.add_argument('--output', help='Output folder', required=True)
|
parser.add_argument('--output', help='Output folder', required=True)
|
||||||
parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False)
|
parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False)
|
||||||
|
parser.add_argument("--level", help="The compression ratio to output at", default="3")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.debug:
|
if args.debug:
|
||||||
discord_logging.set_level(logging.DEBUG)
|
discord_logging.set_level(logging.DEBUG)
|
||||||
|
|
||||||
month = datetime.strptime(args.month, '%y-%m')
|
month = datetime.strptime(args.month, '%y-%m')
|
||||||
|
level = int(args.level)
|
||||||
|
|
||||||
log.info(f"Input folder: {args.input}")
|
log.info(f"Input folder: {args.input}")
|
||||||
log.info(f"Output folder: {args.output}")
|
log.info(f"Output folder: {args.output}")
|
||||||
|
log.info(f"Compression level: {level}")
|
||||||
|
|
||||||
prefix = None
|
prefix = None
|
||||||
if args.type == "comments":
|
if args.type == "comments":
|
||||||
|
@ -151,7 +154,7 @@ if __name__ == "__main__":
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
output_path = os.path.join(args.output, args.type, f"{prefix}_{month.strftime('%Y-%m')}.zst")
|
output_path = os.path.join(args.output, args.type, f"{prefix}_{month.strftime('%Y-%m')}.zst")
|
||||||
output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
|
output_handle = zstandard.ZstdCompressor(level=level).stream_writer(open(output_path, 'wb'))
|
||||||
|
|
||||||
count_objects = 0
|
count_objects = 0
|
||||||
minute_iterator = month
|
minute_iterator = month
|
||||||
|
@ -159,14 +162,14 @@ if __name__ == "__main__":
|
||||||
while minute_iterator < end_time:
|
while minute_iterator < end_time:
|
||||||
minute_file_path = os.path.join(args.input, args.type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst")
|
minute_file_path = os.path.join(args.input, args.type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst")
|
||||||
for obj, line, _ in utils.read_obj_zst_meta(minute_file_path):
|
for obj, line, _ in utils.read_obj_zst_meta(minute_file_path):
|
||||||
output_handle.write(line)
|
output_handle.write(line.encode('utf-8'))
|
||||||
output_handle.write(NEWLINE_ENCODED)
|
output_handle.write(NEWLINE_ENCODED)
|
||||||
|
|
||||||
count_objects += 1
|
count_objects += 1
|
||||||
if count_objects % 100000 == 0:
|
if count_objects % 100000 == 0:
|
||||||
log.info(f"{minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects}")
|
log.info(f"{minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}")
|
||||||
|
|
||||||
minute_iterator += timedelta(minutes=1)
|
minute_iterator += timedelta(minutes=1)
|
||||||
|
|
||||||
log.info(f"{minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects}")
|
log.info(f"{minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}")
|
||||||
output_handle.close()
|
output_handle.close()
|
||||||
|
|
|
@ -7,7 +7,7 @@ log = discord_logging.init_logging()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parent_folder = r"\\MYCLOUDPR4100\Public\ingest\combined\submissions"
|
parent_folder = r"\\MYCLOUDPR4100\Public\ingest\combined\comments"
|
||||||
files = []
|
files = []
|
||||||
for folder_name in os.listdir(parent_folder):
|
for folder_name in os.listdir(parent_folder):
|
||||||
folder = os.path.join(parent_folder, folder_name)
|
folder = os.path.join(parent_folder, folder_name)
|
||||||
|
@ -20,7 +20,7 @@ if __name__ == "__main__":
|
||||||
count_moved = 0
|
count_moved = 0
|
||||||
for folder, old_file in files:
|
for folder, old_file in files:
|
||||||
old_path = os.path.join(folder, old_file)
|
old_path = os.path.join(folder, old_file)
|
||||||
new_file = old_file.replace("RC_", "RS_")
|
new_file = old_file.replace("RS_", "RC_")
|
||||||
new_path = os.path.join(folder, new_file)
|
new_path = os.path.join(folder, new_file)
|
||||||
|
|
||||||
os.rename(old_path, new_path)
|
os.rename(old_path, new_path)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue