mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 23:55:18 -04:00
Add from/to
This commit is contained in:
parent
3984f0ecfc
commit
15d574e198
3 changed files with 75 additions and 47 deletions
|
@ -139,6 +139,51 @@ def process_file(file, queue):
|
|||
queue.put(file)
|
||||
|
||||
|
||||
def process_update(input_files, queue, last_log_time, force_write):
|
||||
file_update = queue.get()
|
||||
if file_update.error_message is not None:
|
||||
log.warning(f"File failed {file_update.input_path}: {file_update.error_message}")
|
||||
current_time = time.time()
|
||||
|
||||
input_files[file_update.input_path] = file_update
|
||||
if force_write or last_log_time is None or (current_time - last_log_time) > 5 or queue.empty():
|
||||
total_lines_processed = 0
|
||||
total_bytes_processed = 0
|
||||
total_lines_errored = 0
|
||||
files_processed = 0
|
||||
files_errored = 0
|
||||
i = 0
|
||||
for file in input_files.values():
|
||||
total_lines_processed += file.lines_processed
|
||||
total_bytes_processed += file.bytes_processed
|
||||
total_lines_errored += file.error_lines
|
||||
files_processed += 1 if file.complete or file.error_message is not None else 0
|
||||
files_errored += 1 if file.error_message is not None else 0
|
||||
i += 1
|
||||
if file_update.complete or file_update.error_message is not None:
|
||||
save_file_list(input_files, status_json, script_type)
|
||||
progress_queue.put([current_time, total_lines_processed, total_bytes_processed])
|
||||
|
||||
first_time, first_lines, first_bytes = progress_queue.peek()
|
||||
bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time))
|
||||
speed_queue.put(bytes_per_second)
|
||||
seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list)))
|
||||
minutes_left = int(seconds_left / 60)
|
||||
hours_left = int(minutes_left / 60)
|
||||
days_left = int(hours_left / 24)
|
||||
|
||||
log.info(
|
||||
f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : "
|
||||
f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : "
|
||||
f"{files_processed}({files_errored})/{len(input_files)} files : "
|
||||
f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : "
|
||||
f"{queue.qsize()} files in queue : {current_time} : {last_log_time} : {current_time - last_log_time if last_log_time is not None else 0} : "
|
||||
f"{(current_time - last_log_time) > 5 if last_log_time is not None else 0} : {int(current_time - last_log_time) > 5 if last_log_time is not None else 0} : "
|
||||
f"{last_log_time is None or (current_time - last_log_time) > 5 or queue.empty()} : {queue.empty()}")
|
||||
last_log_time = time.time()
|
||||
return last_log_time
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files")
|
||||
parser.add_argument("input", help="The input folder to recursively read files from")
|
||||
|
@ -209,47 +254,10 @@ if __name__ == '__main__':
|
|||
workers = pool.starmap_async(process_file, [(file, queue) for file in files_to_process], chunksize=1, error_callback=log.info)
|
||||
while not workers.ready():
|
||||
# loop until the workers are all done, pulling in status messages as they are sent
|
||||
file_update = queue.get()
|
||||
if file_update.error_message is not None:
|
||||
log.warning(f"File failed {file_update.input_path}: {file_update.error_message}")
|
||||
current_time = time.time()
|
||||
last_log_time = process_update(input_files, queue, last_log_time, False)
|
||||
|
||||
input_files[file_update.input_path] = file_update
|
||||
if last_log_time is None or (current_time - last_log_time) > 5 or queue.empty():
|
||||
total_lines_processed = 0
|
||||
total_bytes_processed = 0
|
||||
total_lines_errored = 0
|
||||
files_processed = 0
|
||||
files_errored = 0
|
||||
i = 0
|
||||
for file in input_files.values():
|
||||
total_lines_processed += file.lines_processed
|
||||
total_bytes_processed += file.bytes_processed
|
||||
total_lines_errored += file.error_lines
|
||||
files_processed += 1 if file.complete or file.error_message is not None else 0
|
||||
files_errored += 1 if file.error_message is not None else 0
|
||||
i += 1
|
||||
if file_update.complete or file_update.error_message is not None:
|
||||
save_file_list(input_files, status_json, script_type)
|
||||
progress_queue.put([current_time, total_lines_processed, total_bytes_processed])
|
||||
while not queue.empty():
|
||||
|
||||
first_time, first_lines, first_bytes = progress_queue.peek()
|
||||
bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time))
|
||||
speed_queue.put(bytes_per_second)
|
||||
seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list)))
|
||||
minutes_left = int(seconds_left / 60)
|
||||
hours_left = int(minutes_left / 60)
|
||||
days_left = int(hours_left / 24)
|
||||
|
||||
log.info(
|
||||
f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : "
|
||||
f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : "
|
||||
f"{files_processed}({files_errored})/{len(input_files)} files : "
|
||||
f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : "
|
||||
f"{queue.qsize()} files in queue : {current_time} : {last_log_time} : {current_time - last_log_time if last_log_time is not None else 0} : "
|
||||
f"{(current_time - last_log_time) > 5 if last_log_time is not None else 0} : {int(current_time - last_log_time) > 5 if last_log_time is not None else 0} : "
|
||||
f"{last_log_time is None or (current_time - last_log_time) > 5 or queue.empty()} : {queue.empty()}")
|
||||
last_log_time = time.time()
|
||||
|
||||
log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}")
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue