mirror of https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-23 06:40:47 -04:00

commit f35762e203 (parent 4ecf22aaee)
Add split blocks by minute

7 changed files with 318 additions and 7 deletions
@@ -142,6 +142,7 @@ if __name__ == "__main__":
     log.info(f"Input folder: {args.input}")
     log.info(f"Output folder: {args.output}")
+    log.info(f"Month: {args.month}")
     log.info(f"Compression level: {level}")

     prefix = None

@@ -8,7 +8,7 @@ log = discord_logging.init_logging()


 if __name__ == "__main__":
-    input_path = r"\\MYCLOUDPR4100\Public\ingest\combined\comments\RC_23-07-10.zst"
+    input_path = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2023-09.zst"

     input_file_paths = []
     if os.path.isdir(input_path):
@@ -32,7 +32,7 @@ if __name__ == "__main__":
         for obj, line, file_bytes_processed in utils.read_obj_zst_meta(file_path):
             new_timestamp = int(obj['created_utc'])
             created = datetime.utcfromtimestamp(new_timestamp)
-            if previous_timestamp is not None and previous_timestamp - (60 * 60) > new_timestamp:
+            if previous_timestamp is not None and previous_timestamp - (2) > new_timestamp:
                 log.warning(f"Out of order timestamps {datetime.utcfromtimestamp(previous_timestamp).strftime('%Y-%m-%d %H:%M:%S')} - 4 hours > {created.strftime('%Y-%m-%d %H:%M:%S')}")
             previous_timestamp = new_timestamp
             file_lines += 1
personal/transform/split_blocks_by_minutes.py (Normal file, 46 lines)

@@ -0,0 +1,46 @@
+import discord_logging
+import os
+import zstandard
+from datetime import datetime
+import json
+
+log = discord_logging.init_logging()
+
+import utils
+
+NEWLINE_ENCODED = "\n".encode('utf-8')
+
+
+if __name__ == "__main__":
+    input_file = r"\\MYCLOUDPR4100\Public\reddit\blocks\RS_2023-10.zst_blocks"
+    output_folder = r"\\MYCLOUDPR4100\Public\ingest\download"
+    file_type = "comments" if "RC" in input_file else "submissions"
+
+    log.info(f"Input: {input_file} - Output: {output_folder}")
+    previous_minute, output_handle, created_utc = None, None, None
+    count_objects, count_minute = 0, 0
+    for obj in utils.read_obj_zst_blocks(input_file):
+        created_utc = datetime.utcfromtimestamp(obj["created_utc"])
+        current_minute = created_utc.replace(second=0)
+
+        if previous_minute is None or current_minute > previous_minute:
+            log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute:,}")
+            previous_minute = current_minute
+            count_minute = 0
+            if output_handle is not None:
+                output_handle.close()
+
+            output_path = os.path.join(output_folder, file_type, created_utc.strftime('%y-%m-%d'))
+            if not os.path.exists(output_path):
+                os.makedirs(output_path)
+            output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst")
+            output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
+
+        count_objects += 1
+        count_minute += 1
+        output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8'))
+        output_handle.write(NEWLINE_ENCODED)
+
+    log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute:,}")
+    if output_handle is not None:
+        output_handle.close()
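The new script buckets rows from a .zst_blocks dump into one zstd-compressed newline-delimited-JSON file per minute. A minimal sketch of reading one of those per-minute outputs back (the path below is hypothetical, following the script's RC_/RS_ naming scheme; the read logic only assumes each file is zstd-compressed NDJSON):

import io
import json
import zstandard

# hypothetical per-minute output produced by the script above
path = r"\\MYCLOUDPR4100\Public\ingest\download\comments\23-10-01\RC_23-10-01_00-00.zst"

with open(path, "rb") as fh:
    reader = zstandard.ZstdDecompressor().stream_reader(fh)
    # wrap the binary zstd stream so we can iterate it line by line
    for line in io.TextIOWrapper(reader, encoding="utf-8"):
        obj = json.loads(line)
        # each obj is one comment/submission dict from that minute
        print(obj["created_utc"])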
@@ -12,7 +12,7 @@ NEWLINE_ENCODED = "\n".encode('utf-8')


 if __name__ == "__main__":
-    input_file = r"\\MYCLOUDPR4100\Public\RC_2023-08.zst"
+    input_file = r"\\MYCLOUDPR4100\Public\RS_2023-09.zst"
     output_folder = r"\\MYCLOUDPR4100\Public\ingest\download"
     file_type = "comments" if "RC" in input_file else "submissions"

@@ -1,5 +1,7 @@
 import zstandard
 import json
+import os
+from zst_blocks import ZstBlocksFile


 def read_obj_zst(file_name):
@@ -75,6 +77,14 @@ class OutputZst:
         return True


+# copied from https://github.com/ArthurHeitmann/zst_blocks_format
+def read_obj_zst_blocks(file_name):
+    with open(file_name, "rb") as file:
+        for row in ZstBlocksFile.streamRows(file):
+            line = row.decode()
+            yield json.loads(line.strip())
+
+
 def base36encode(integer: int) -> str:
     chars = '0123456789abcdefghijklmnopqrstuvwxyz'
     sign = '-' if integer < 0 else ''
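The new utils.read_obj_zst_blocks generator turns a .zst_blocks file into a stream of decoded JSON objects. A minimal usage sketch (run from the folder containing utils.py; the input path is the same dump the new split script uses):

import utils

# stream objects straight out of the block-compressed dump
for obj in utils.read_obj_zst_blocks(r"\\MYCLOUDPR4100\Public\reddit\blocks\RS_2023-10.zst_blocks"):
    print(obj["id"], obj["created_utc"])
    break  # just peek at the first row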
personal/zst_blocks.py (Normal file, 254 lines)

@@ -0,0 +1,254 @@
+# copied from https://github.com/ArthurHeitmann/zst_blocks_format
+
+from __future__ import annotations
+from dataclasses import dataclass
+import os
+import time
+import struct
+from typing import BinaryIO, Callable, Iterable, Literal
+from zstandard import ZstdDecompressor, ZstdCompressor
+
+_endian: Literal["little", "big"] = "little"
+
+_uint32Struct = struct.Struct("<I")
+_uint32X2Struct = struct.Struct("<II")
+
+_defaultCompressionLevel = 3
+
+
+class ZstBlocksFile:
+    blocks: list[ZstBlock]
+
+    def __init__(self, blocks: list[ZstBlock]):
+        self.blocks = blocks
+
+    @staticmethod
+    def readBlockRowAt(file: BinaryIO, rowPosition: RowPosition) -> bytes:
+        file.seek(rowPosition.blockOffset)
+        return ZstBlock.readRow(file, rowPosition.rowIndex)
+
+    @staticmethod
+    def readMultipleBlocks(file: BinaryIO, rowPositions: list[RowPosition]) -> list[bytes]:
+        blockGroupsDict: dict[int, RowPositionGroup] = {}
+        for i, rowPosition in enumerate(rowPositions):
+            if rowPosition.blockOffset not in blockGroupsDict:
+                blockGroupsDict[rowPosition.blockOffset] = RowPositionGroup(rowPosition.blockOffset, [])
+            blockGroupsDict[rowPosition.blockOffset].rowIndices.append(RowIndex(rowPosition.rowIndex, i))
+        blockGroups = list(blockGroupsDict.values())
+
+        rows: list = [None] * len(rowPositions)
+        for blockGroup in blockGroups:
+            file.seek(blockGroup.blockOffset)
+            blockRows = ZstBlock.readSpecificRows(file, map(lambda pair: pair.withinBlockIndex, blockGroup.rowIndices))
+            for originalPosition, row in zip(blockGroup.rowIndices, blockRows):
+                rows[originalPosition.originalRowIndex] = row
+
+        return rows
+
+    @staticmethod
+    def streamRows(file: BinaryIO, blockIndexProgressCallback: Callable[[int], None] | None = None) -> Iterable[bytes]:
+        fileSize = os.path.getsize(file.name)
+        blockIndex = 0
+        while file.tell() < fileSize:
+            yield from ZstBlock.streamRows(file)
+            blockIndex += 1
+            if blockIndexProgressCallback is not None:
+                blockIndexProgressCallback(blockIndex)
+
+    @staticmethod
+    def appendBlock(file: BinaryIO, rows: list[bytes], compressionLevel=_defaultCompressionLevel) -> None:
+        file.seek(file.tell())
+        ZstBlock(rows).write(file, compressionLevel=compressionLevel)
+
+    @staticmethod
+    def writeStream(file: BinaryIO, rowStream: Iterable[bytes], blockSize: int, rowPositions: list[RowPosition] | None = None, compressionLevel=_defaultCompressionLevel) -> None:
+        pendingRows = []
+        for row in rowStream:
+            pendingRows.append(row)
+            if len(pendingRows) >= blockSize:
+                ZstBlock(pendingRows).write(file, rowPositions, compressionLevel=compressionLevel)
+                pendingRows = []
+        if len(pendingRows) > 0:
+            ZstBlock(pendingRows).write(file, rowPositions, compressionLevel=compressionLevel)
+
+    @staticmethod
+    def writeBlocksStream(file: BinaryIO, blocksStream: Iterable[list[bytes]], rowPositions: list[RowPosition] | None = None, compressionLevel=_defaultCompressionLevel) -> None:
+        for rows in blocksStream:
+            ZstBlock(rows).write(file, rowPositions, compressionLevel=compressionLevel)
+
+    @staticmethod
+    def countBlocks(file: BinaryIO) -> int:
+        fileSize = os.path.getsize(file.name)
+        blockCount = 0
+        initialPos = file.tell()
+        pos = initialPos
+        while pos < fileSize:
+            blockCount += 1
+            blockSize = _uint32Struct.unpack(file.read(4))[0]
+            pos += 4 + blockSize
+            file.seek(pos)
+        file.seek(initialPos)
+        return blockCount
+
+    @staticmethod
+    def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]:
+        fileSize = os.path.getsize(file.name)
+        while file.tell() < fileSize:
+            yield from ZstBlock.generateRowPositions(file)
+
+
+class ZstBlock:
+    rows: list[bytes]
+
+    def __init__(self, rows: list[bytes]):
+        self.rows = rows
+
+    @classmethod
+    def streamRows(cls, file: BinaryIO) -> Iterable[bytes]:
+        compressedSize = _uint32Struct.unpack(file.read(4))[0]
+        compressedData = file.read(compressedSize)
+        decompressedData = ZstdDecompressor().decompress(compressedData)
+
+        memoryView = memoryview(decompressedData)
+        count = _uint32Struct.unpack(memoryView[0:4])[0]
+        rows: list[ZstRowInfo] = [None] * count
+        for i in range(count):
+            rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize)
+
+        dataStart = 4 + count * ZstRowInfo.structSize
+        for row in rows:
+            yield decompressedData[dataStart + row.offset: dataStart + row.offset + row.size]
+
+    @classmethod
+    def readSpecificRows(cls, file: BinaryIO, rowIndices: Iterable[int]) -> list[bytes]:
+        compressedSize = _uint32Struct.unpack(file.read(4))[0]
+        compressedData = file.read(compressedSize)
+        decompressedData = ZstdDecompressor().decompress(compressedData)
+
+        memoryView = memoryview(decompressedData)
+        count = _uint32Struct.unpack(memoryView[0:4])[0]
+        rows: list[ZstRowInfo] = [None] * count
+        for i in range(count):
+            rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize)
+
+        dataStart = 4 + count * ZstRowInfo.structSize
+        return [
+            decompressedData[dataStart + rows[rowIndex].offset: dataStart + rows[rowIndex].offset + rows[rowIndex].size]
+            for rowIndex in rowIndices
+        ]
+
+    @classmethod
+    def readRow(cls, file: BinaryIO, rowIndex: int) -> bytes:
+        compressedSize = _uint32Struct.unpack(file.read(4))[0]
+        compressedData = file.read(compressedSize)
+        decompressedData = ZstdDecompressor().decompress(compressedData)
+
+        memoryView = memoryview(decompressedData)
+        count = _uint32Struct.unpack(memoryView[0:4])[0]
+        if rowIndex >= count:
+            raise Exception("Row index out of range")
+        row = ZstRowInfo.read(memoryView, 4 + rowIndex * ZstRowInfo.structSize)
+
+        dataStart = 4 + count * ZstRowInfo.structSize
+        return decompressedData[dataStart + row.offset: dataStart + row.offset + row.size]
+
+    def write(self, file: BinaryIO, rowPositions: list[RowPosition] | None = None, compressionLevel=_defaultCompressionLevel) -> None:
+        uncompressedSize = 4 + len(self.rows) * ZstRowInfo.structSize + sum(len(row) for row in self.rows)
+        uncompressedBytes = bytearray(uncompressedSize)
+        uncompressedBytes[0:4] = len(self.rows).to_bytes(4, _endian)
+
+        dataOffset = 4 + len(self.rows) * ZstRowInfo.structSize
+        blockOffset = file.tell()
+        currentDataLocalOffset = 0
+        for i in range(len(self.rows)):
+            row = self.rows[i]
+            rowInfo = ZstRowInfo(currentDataLocalOffset, len(row))
+            rowInfo.write(uncompressedBytes, 4 + i * ZstRowInfo.structSize)
+            uncompressedBytes[dataOffset + currentDataLocalOffset: dataOffset + currentDataLocalOffset + len(row)] = row
+            currentDataLocalOffset += len(row)
+            if rowPositions is not None:
+                rowPositions.append(RowPosition(blockOffset, i))
+        uncompressedData = bytes(uncompressedBytes)
+        compressedData = ZstdCompressor(compressionLevel).compress(uncompressedData)
+        compressedSize = len(compressedData)
+        blockBytes = bytearray(4 + compressedSize)
+        blockBytes[0:4] = compressedSize.to_bytes(4, _endian)
+        blockBytes[4:4 + compressedSize] = compressedData
+        file.write(blockBytes)
+
+    @staticmethod
+    def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]:
+        blockOffset = file.tell()
+        compressedSize = _uint32Struct.unpack(file.read(4))[0]
+        compressedData = file.read(compressedSize)
+        decompressedData = ZstdDecompressor().decompress(compressedData)
+
+        memoryView = memoryview(decompressedData)
+        count = _uint32Struct.unpack(memoryView[0:4])[0]
+        for i in range(count):
+            yield RowPosition(blockOffset, i)
+
+
+class ZstRowInfo:
+    structSize = 8
+    offset: int
+    size: int
+
+    def __init__(self, offset: int, size: int):
+        self.offset = offset
+        self.size = size
+
+    @staticmethod
+    def read(bytes: bytes, position: int) -> ZstRowInfo:
+        offset, size = _uint32X2Struct.unpack(bytes[position: position + ZstRowInfo.structSize])
+        return ZstRowInfo(offset, size)
+
+    def write(self, bytes: bytearray, position: int) -> None:
+        bytes[position + 0: position + 4] = self.offset.to_bytes(4, _endian)
+        bytes[position + 4: position + 8] = self.size.to_bytes(4, _endian)
+
+
+@dataclass
+class RowPosition:
+    blockOffset: int
+    rowIndex: int
+
+
+@dataclass
+class RowIndex:
+    withinBlockIndex: int
+    originalRowIndex: int
+
+
+@dataclass
+class RowPositionGroup:
+    blockOffset: int
+    rowIndices: list[RowIndex]
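The vendored zst_blocks module implements a simple framed format: each block on disk is a little-endian uint32 compressed size followed by a zstd frame whose payload holds a row count, an (offset, size) table of ZstRowInfo entries, then the concatenated row bytes. A minimal round-trip sketch using only the classes above (the scratch file name is made up for illustration):

from zst_blocks import ZstBlocksFile

rows = [b'{"id": "a"}', b'{"id": "b"}', b'{"id": "c"}']

# write three rows, two rows per block (hypothetical scratch file)
with open("example.zst_blocks", "wb") as f:
    ZstBlocksFile.writeStream(f, iter(rows), blockSize=2)

# stream them back in the original order
with open("example.zst_blocks", "rb") as f:
    for row in ZstBlocksFile.streamRows(f):
        print(row.decode())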
@@ -7,7 +7,7 @@ from datetime import datetime
 import logging.handlers

 # put the path to the input file, or a folder of files to process all of
-input_file = r"\\MYCLOUDPR4100\Public\reddit_test"
+input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits/CryptoCurrency_submissions.zst"
 # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
 output_file = r"\\MYCLOUDPR4100\Public\output"
 # the format to output in, pick from the following options
@@ -28,8 +28,8 @@ single_field = None
 write_bad_lines = True

 # only output items between these two dates
-from_date = datetime.strptime("2005-01-01", "%Y-%m-%d")
-to_date = datetime.strptime("2025-01-01", "%Y-%m-%d")
+from_date = datetime.strptime("2022-01-01", "%Y-%m-%d")
+to_date = datetime.strptime("2022-12-31", "%Y-%m-%d")

 # the field to filter on, the values to filter with and whether it should be an exact match
 # some examples:
@@ -76,7 +76,7 @@ to_date = datetime.strptime("2025-01-01", "%Y-%m-%d")
 # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"

 field = "title"
-values = ['post race discussion']
+values = ['']
 # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
 # if this list is very large, it could greatly slow down the process
 values_file = None
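The field/values/values_file settings above drive the script's filtering. A hedged sketch of how an exact-versus-substring match on a field could work; the helper name and case-insensitive behavior are illustrative assumptions, not the script's actual implementation:

# illustrative only: 'matches' is a hypothetical helper, not defined in this repo
def matches(obj: dict, field: str, values: list[str], exact_match: bool) -> bool:
    # assumes case-insensitive comparison, which may differ from the real script
    value = str(obj.get(field, "")).lower()
    if exact_match:
        return value in values
    return any(v in value for v in values)

# usage: keep a submission whose title contains "daily discussion"
keep = matches({"title": "Daily Discussion - October 1"}, "title", ["daily discussion"], exact_match=False)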