mirror of https://github.com/Watchful1/PushshiftDumps.git, synced 2025-07-23 14:50:35 -04:00
Add split blocks by minute
This commit is contained in: parent 4ecf22aaee, commit f35762e203
7 changed files with 318 additions and 7 deletions
254	personal/zst_blocks.py	Normal file
@@ -0,0 +1,254 @@
# copied from https://github.com/ArthurHeitmann/zst_blocks_format

from __future__ import annotations
from dataclasses import dataclass
import os
import time
import struct
from typing import BinaryIO, Callable, Iterable, Literal
from zstandard import ZstdDecompressor, ZstdCompressor

_endian: Literal["little", "big"] = "little"

_uint32Struct = struct.Struct("<I")
_uint32X2Struct = struct.Struct("<II")

_defaultCompressionLevel = 3

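# On-disk layout, as implemented by the classes below (a summary of this file's
# behavior, not an external spec): each block is a 4-byte little-endian
# compressed length followed by a zstd frame. The decompressed payload begins
# with a 4-byte row count, then one 8-byte (offset, size) entry per row
# (ZstRowInfo), then the concatenated row payloads.
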
class ZstBlocksFile:
	blocks: list[ZstBlock]

	def __init__(self, blocks: list[ZstBlock]):
		self.blocks = blocks

	@staticmethod
	def readBlockRowAt(file: BinaryIO, rowPosition: RowPosition) -> bytes:
		file.seek(rowPosition.blockOffset)
		return ZstBlock.readRow(file, rowPosition.rowIndex)

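	# Groups the requested row positions by block offset so each block is read and
	# decompressed only once, then returns the rows in the originally requested order.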
	@staticmethod
	def readMultipleBlocks(file: BinaryIO, rowPositions: list[RowPosition]) -> list[bytes]:
		blockGroupsDict: dict[int, RowPositionGroup] = {}
		for i, rowPosition in enumerate(rowPositions):
			if rowPosition.blockOffset not in blockGroupsDict:
				blockGroupsDict[rowPosition.blockOffset] = RowPositionGroup(rowPosition.blockOffset, [])
			blockGroupsDict[rowPosition.blockOffset].rowIndices.append(RowIndex(rowPosition.rowIndex, i))
		blockGroups = list(blockGroupsDict.values())

		rows: list = [None] * len(rowPositions)
		for blockGroup in blockGroups:
			file.seek(blockGroup.blockOffset)
			blockRows = ZstBlock.readSpecificRows(file, map(lambda pair: pair.withinBlockIndex, blockGroup.rowIndices))
			for originalPosition, row in zip(blockGroup.rowIndices, blockRows):
				rows[originalPosition.originalRowIndex] = row

		return rows

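	# Yields every row in file order, block by block; the optional callback is
	# called with the number of blocks processed so far.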
	@staticmethod
	def streamRows(file: BinaryIO, blockIndexProgressCallback: Callable[[int], None] | None = None) -> Iterable[bytes]:
		fileSize = os.path.getsize(file.name)
		blockIndex = 0
		while file.tell() < fileSize:
			yield from ZstBlock.streamRows(file)
			blockIndex += 1
			if blockIndexProgressCallback is not None:
				blockIndexProgressCallback(blockIndex)

	@staticmethod
	def appendBlock(file: BinaryIO, rows: list[bytes], compressionLevel=_defaultCompressionLevel) -> None:
		file.seek(file.tell())
		ZstBlock(rows).write(file, compressionLevel=compressionLevel)

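	# Buffers incoming rows and writes a compressed block whenever blockSize rows have
	# accumulated; a final partial block is flushed at the end. If rowPositions is
	# provided, the (blockOffset, rowIndex) of every written row is appended to it.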
	@staticmethod
	def writeStream(file: BinaryIO, rowStream: Iterable[bytes], blockSize: int,
			rowPositions: list[RowPosition] | None = None,
			compressionLevel=_defaultCompressionLevel) -> None:
		pendingRows = []
		for row in rowStream:
			pendingRows.append(row)
			if len(pendingRows) >= blockSize:
				ZstBlock(pendingRows).write(file, rowPositions, compressionLevel=compressionLevel)
				pendingRows = []
		if len(pendingRows) > 0:
			ZstBlock(pendingRows).write(file, rowPositions, compressionLevel=compressionLevel)

	@staticmethod
	def writeBlocksStream(file: BinaryIO, blocksStream: Iterable[list[bytes]],
			rowPositions: list[RowPosition] | None = None,
			compressionLevel=_defaultCompressionLevel) -> None:
		for rows in blocksStream:
			ZstBlock(rows).write(file, rowPositions, compressionLevel=compressionLevel)

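	# Counts blocks by reading each 4-byte length prefix and seeking past the
	# compressed data, without decompressing; the original file position is restored.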
	@staticmethod
	def countBlocks(file: BinaryIO) -> int:
		fileSize = os.path.getsize(file.name)
		blockCount = 0
		initialPos = file.tell()
		pos = initialPos
		while pos < fileSize:
			blockCount += 1
			blockSize = _uint32Struct.unpack(file.read(4))[0]
			pos += 4 + blockSize
			file.seek(pos)
		file.seek(initialPos)
		return blockCount

	@staticmethod
	def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]:
		fileSize = os.path.getsize(file.name)
		while file.tell() < fileSize:
			yield from ZstBlock.generateRowPositions(file)


class ZstBlock:
	rows: list[bytes]

	def __init__(self, rows: list[bytes]):
		self.rows = rows

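	# Reads one length-prefixed block at the current file position, decompresses it,
	# parses the row table, and yields each row's bytes in order.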
	@classmethod
	def streamRows(cls, file: BinaryIO) -> Iterable[bytes]:
		compressedSize = _uint32Struct.unpack(file.read(4))[0]
		compressedData = file.read(compressedSize)
		decompressedData = ZstdDecompressor().decompress(compressedData)

		memoryView = memoryview(decompressedData)
		count = _uint32Struct.unpack(memoryView[0:4])[0]
		rows: list[ZstRowInfo] = [None] * count
		for i in range(count):
			rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize)

		dataStart = 4 + count * ZstRowInfo.structSize
		for row in rows:
			yield decompressedData[dataStart + row.offset: dataStart + row.offset + row.size]

	@classmethod
	def readSpecificRows(cls, file: BinaryIO, rowIndices: Iterable[int]) -> list[bytes]:
		compressedSize = _uint32Struct.unpack(file.read(4))[0]
		compressedData = file.read(compressedSize)
		decompressedData = ZstdDecompressor().decompress(compressedData)

		memoryView = memoryview(decompressedData)
		count = _uint32Struct.unpack(memoryView[0:4])[0]
		rows: list[ZstRowInfo] = [None] * count
		for i in range(count):
			rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize)

		dataStart = 4 + count * ZstRowInfo.structSize
		return [
			decompressedData[dataStart + rows[rowIndex].offset: dataStart + rows[rowIndex].offset + rows[rowIndex].size]
			for rowIndex in rowIndices
		]

	@classmethod
	def readRow(cls, file: BinaryIO, rowIndex: int) -> bytes:
		compressedSize = _uint32Struct.unpack(file.read(4))[0]
		compressedData = file.read(compressedSize)
		decompressedData = ZstdDecompressor().decompress(compressedData)

		memoryView = memoryview(decompressedData)
		count = _uint32Struct.unpack(memoryView[0:4])[0]
		if rowIndex >= count:
			raise Exception("Row index out of range")
		row = ZstRowInfo.read(memoryView, 4 + rowIndex * ZstRowInfo.structSize)

		dataStart = 4 + count * ZstRowInfo.structSize
		return decompressedData[dataStart + row.offset: dataStart + row.offset + row.size]

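	# Serializes the block: builds the uncompressed payload (row count, row table,
	# concatenated row data), compresses it with zstd at compressionLevel, and
	# writes a 4-byte length prefix followed by the compressed bytes.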
	def write(self, file: BinaryIO, rowPositions: list[RowPosition] | None = None,
			compressionLevel=_defaultCompressionLevel) -> None:
		uncompressedSize = \
			4 + \
			len(self.rows) * ZstRowInfo.structSize + \
			sum(len(row) for row in self.rows)
		uncompressedBytes = bytearray(uncompressedSize)
		uncompressedBytes[0:4] = len(self.rows).to_bytes(4, _endian)

		dataOffset = 4 + len(self.rows) * ZstRowInfo.structSize
		blockOffset = file.tell()
		currentDataLocalOffset = 0
		for i in range(len(self.rows)):
			row = self.rows[i]
			rowInfo = ZstRowInfo(currentDataLocalOffset, len(row))
			rowInfo.write(uncompressedBytes, 4 + i * ZstRowInfo.structSize)
			uncompressedBytes[dataOffset + currentDataLocalOffset: dataOffset + currentDataLocalOffset + len(row)] = row
			currentDataLocalOffset += len(row)
			if rowPositions is not None:
				rowPositions.append(RowPosition(blockOffset, i))
		uncompressedData = bytes(uncompressedBytes)
		compressedData = ZstdCompressor(compressionLevel).compress(uncompressedData)
		compressedSize = len(compressedData)
		blockBytes = bytearray(4 + compressedSize)
		blockBytes[0:4] = compressedSize.to_bytes(4, _endian)
		blockBytes[4:4 + compressedSize] = compressedData
		file.write(blockBytes)

	@staticmethod
	def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]:
		blockOffset = file.tell()
		compressedSize = _uint32Struct.unpack(file.read(4))[0]
		compressedData = file.read(compressedSize)
		decompressedData = ZstdDecompressor().decompress(compressedData)

		memoryView = memoryview(decompressedData)
		count = _uint32Struct.unpack(memoryView[0:4])[0]
		for i in range(count):
			yield RowPosition(blockOffset, i)


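# Fixed 8-byte per-row header stored at the start of each decompressed block:
# a uint32 offset into the block's data area and a uint32 payload size.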
class ZstRowInfo:
	structSize = 8
	offset: int
	size: int

	def __init__(self, offset: int, size: int):
		self.offset = offset
		self.size = size

	@staticmethod
	def read(bytes: bytes, position: int) -> ZstRowInfo:
		offset, size = _uint32X2Struct.unpack(bytes[position: position + ZstRowInfo.structSize])
		return ZstRowInfo(offset, size)

	def write(self, bytes: bytearray, position: int) -> None:
		bytes[position + 0: position + 4] = self.offset.to_bytes(4, _endian)
		bytes[position + 4: position + 8] = self.size.to_bytes(4, _endian)


@dataclass
class RowPosition:
	blockOffset: int
	rowIndex: int


@dataclass
class RowIndex:
	withinBlockIndex: int
	originalRowIndex: int


@dataclass
class RowPositionGroup:
	blockOffset: int
	rowIndices: list[RowIndex]
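A minimal usage sketch of the module above. The file name example.zst_blocks, the sample rows, and the block size of 2 are illustrative, and the import assumes the caller sits next to personal/zst_blocks.py:

from zst_blocks import ZstBlocksFile, RowPosition

rows = [b"first row", b"second row", b"third row"]
positions: list[RowPosition] = []

# Write the rows as zstd-compressed blocks of up to 2 rows each, recording
# the (blockOffset, rowIndex) of every row as it is written.
with open("example.zst_blocks", "wb") as f:
	ZstBlocksFile.writeStream(f, rows, blockSize=2, rowPositions=positions)

# Stream every row back in file order.
with open("example.zst_blocks", "rb") as f:
	assert list(ZstBlocksFile.streamRows(f)) == rows

# Random access: re-read the second row via its recorded position.
with open("example.zst_blocks", "rb") as f:
	assert ZstBlocksFile.readBlockRowAt(f, positions[1]) == rows[1]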