mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 23:55:18 -04:00
More fixes
This commit is contained in:
parent
021d033732
commit
4501ec236f
1 changed file with 6 additions and 3 deletions
|
@@ -202,6 +202,7 @@ if __name__ == '__main__':
|
||||||
total_bytes = 0
|
total_bytes = 0
|
||||||
total_bytes_processed = 0
|
total_bytes_processed = 0
|
||||||
total_lines_processed = 0
|
total_lines_processed = 0
|
||||||
|
total_lines_errored = 0
|
||||||
files_to_process = []
|
files_to_process = []
|
||||||
# calculate the total file size for progress reports, build a list of incomplete files to process
|
# calculate the total file size for progress reports, build a list of incomplete files to process
|
||||||
for file in input_files:
|
for file in input_files:
|
||||||
|
@@ -210,6 +211,7 @@ if __name__ == '__main__':
|
||||||
files_processed += 1
|
files_processed += 1
|
||||||
total_lines_processed += file.lines_processed
|
total_lines_processed += file.lines_processed
|
||||||
total_bytes_processed += file.file_size
|
total_bytes_processed += file.file_size
|
||||||
|
total_lines_errored += file.error_lines
|
||||||
else:
|
else:
|
||||||
files_to_process.append(file)
|
files_to_process.append(file)
|
||||||
|
|
||||||
|
@@ -297,15 +299,16 @@ if __name__ == '__main__':
|
||||||
output_lines = 0
|
output_lines = 0
|
||||||
output_file_path = os.path.join(args.output, args.name + ".zst")
|
output_file_path = os.path.join(args.output, args.name + ".zst")
|
||||||
# combine all the output files into the final results file
|
# combine all the output files into the final results file
|
||||||
with open(output_file_path, 'w') as output_file:
|
with open(output_file_path, 'wb') as output_file:
|
||||||
files_combined = 0
|
files_combined = 0
|
||||||
writer = zstandard.ZstdCompressor().stream_writer(output_file)
|
writer = zstandard.ZstdCompressor().stream_writer(output_file)
|
||||||
for working_file_path in working_file_paths:
|
for working_file_path in working_file_paths:
|
||||||
files_combined += 1
|
files_combined += 1
|
||||||
log.info(f"Reading {files_combined}/{len(working_file_paths)}")
|
log.info(f"Reading {files_combined}/{len(working_file_paths)}")
|
||||||
with open(working_file_path, 'r') as input_file:
|
with open(working_file_path, 'r') as input_file:
|
||||||
for line in input_file.readlines():
|
for line in input_file:
|
||||||
output_lines += 1
|
output_lines += 1
|
||||||
writer.write(line.encode('utf-8'))
|
encoded_line = line.encode('utf-8')
|
||||||
|
writer.write(encoded_line)
|
||||||
|
|
||||||
log.info(f"Finished combining files, {output_lines:,} lines written to {output_file_path}")
|
log.info(f"Finished combining files, {output_lines:,} lines written to {output_file_path}")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue