mirror of
https://github.com/autistic-symposium/sec-pentesting-toolkit.git
synced 2025-04-25 10:09:08 -04:00
1032 lines
42 KiB
Python
Executable File
1032 lines
42 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
# Short summary of the tool; Main() appends it to the usage text and prints
# it again below the help output.
__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
# Version string shown by the --version option that Main() configures.
__version__ = '0.4.3'
__date__ = '2013/09/18'
# TestPythonVersion() prints a notice when sys.version_info[0:3] is below the
# minimum or above the maximum given here.
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 3, 0)
|
|
|
|
"""
|
|
Source code put in public domain by Didier Stevens, no Copyright
|
|
https://DidierStevens.com
|
|
Use at your own risk
|
|
|
|
History:
|
|
2008/05/02: continue
|
|
2008/05/03: continue
|
|
2008/06/02: streams
|
|
2008/10/19: refactor, grep & extract functionality
|
|
2008/10/20: reference
|
|
2008/10/21: cleanup
|
|
2008/11/12: V0.3 dictionary parser
|
|
2008/11/13: option elements
|
|
2008/11/14: continue
|
|
2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco)
|
|
2009/05/11: V0.3.1 updated usage, added --verbose and --extract
|
|
2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco)
|
|
2009/07/18: bugfix EqualCanonical
|
|
2009/07/24: V0.3.3 Added --hash option
|
|
2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput
|
|
2009/07/28: V0.3.4 Added ASCII85Decode support
|
|
2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation
|
|
2009/08/30: V0.3.6 TestPythonVersion
|
|
2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option
|
|
2010/01/09: Fixed parsing of incomplete startxref
|
|
2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option
|
|
2011/12/17: fixed bugs empty objects
|
|
2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt)
|
|
2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option
|
|
2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range
|
|
2013/02/16: V0.4.0 added http/https support; added error handling for missing file or URL; added support for ZIP file with password 'infected'
|
|
2013/03/13: V0.4.1 fixes for Python 3
|
|
2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
|
|
2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
|
|
2013/09/18: V0.4.3 fixed regression bug -w option
|
|
|
|
Todo:
|
|
- handle printf todo
|
|
- fix PrettyPrint
|
|
- support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir
|
|
|
|
"""
|
|
|
|
import re
|
|
import optparse
|
|
import zlib
|
|
import binascii
|
|
import hashlib
|
|
import sys
|
|
import zipfile
|
|
if sys.version_info[0] >= 3:
|
|
from io import StringIO
|
|
import urllib.request
|
|
urllib23 = urllib.request
|
|
else:
|
|
from cStringIO import StringIO
|
|
import urllib2
|
|
urllib23 = urllib2
|
|
|
|
# Character classes returned by CharacterClass(); they are also the first
# element of every (class, text) token tuple built by cPDFTokenizer.
CHAR_WHITESPACE = 1
CHAR_DELIMITER = 2
CHAR_REGULAR = 3

# Values of cPDFParser.context: the kind of element GetObject() is currently
# collecting tokens for (CONTEXT_NONE means "between elements").
CONTEXT_NONE = 1
CONTEXT_OBJ = 2
CONTEXT_XREF = 3
CONTEXT_TRAILER = 4

# Values stored in the `type` attribute of the cPDFElement* classes.
PDF_ELEMENT_COMMENT = 1
PDF_ELEMENT_INDIRECT_OBJECT = 2
PDF_ELEMENT_XREF = 3
PDF_ELEMENT_TRAILER = 4
PDF_ELEMENT_STARTXREF = 5
PDF_ELEMENT_MALFORMED = 6
|
|
|
|
#Convert 2 Bytes If Python 3
|
|
def C2BIP3(string):
    """Convert a str to bytes when running on Python 3.

    On Python 2 the argument is returned unchanged. On Python 3 every
    character is replaced by the byte that has the same ordinal value.
    """
    runningPython3 = sys.version_info[0] > 2
    if not runningPython3:
        return string
    ordinals = [ord(character) for character in string]
    return bytes(ordinals)
|
|
|
|
def CopyWithoutWhiteSpace(content):
    """Return a new list with every token of content that is not whitespace.

    content -- sequence of (character class, text) token tuples.
    """
    return [entry for entry in content if entry[0] != CHAR_WHITESPACE]
|
|
|
|
def Obj2Str(content):
    """Concatenate the repr()-escaped text of all non-whitespace tokens."""
    pieces = []
    for entry in CopyWithoutWhiteSpace(content):
        escaped = repr(entry[1])
        # repr() wraps the text in one quote character on each side; drop them.
        pieces.append(escaped[1:-1])
    return ''.join(pieces)
|
|
|
|
class cPDFDocument:
    """Byte-at-a-time reader with push-back over a PDF document.

    The document can come from an http/https URL, from the first member of a
    ZIP archive (opened with the password 'infected') or from a regular file.
    """

    def __init__(self, file):
        """Open the input named by file.

        file -- URL starting with 'http://' or 'https://', path ending in
                '.zip', or any other path (opened in binary mode).

        When the input cannot be opened, an error message and the exception
        are printed and the program exits through sys.exit().
        """
        self.file = file
        self.eof = False  # set once the underlying stream has been exhausted and closed
        lowered = file.lower()
        if lowered.startswith('http://') or lowered.startswith('https://'):
            try:
                # urlopen() only accepts the timeout argument from Python 2.6.1 on.
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except (urllib23.URLError, IOError):
                # URLError is the base class of HTTPError, so unreachable hosts
                # and DNS failures are reported the same way as HTTP errors.
                print('Error accessing URL %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        elif lowered.endswith('.zip'):
            try:
                self.zipfile = zipfile.ZipFile(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except Exception:
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        else:
            try:
                self.infile = open(file, 'rb')
            except (IOError, OSError):
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        self.ungetted = []
        self.position = -1

    def byte(self):
        """Return the next byte as an int, or None at end of input.

        Bytes pushed back with unget() are returned first (most recent first).
        The underlying stream is closed when its end is reached; later calls
        keep returning None instead of reading from the closed stream.
        """
        if self.ungetted:
            self.position += 1
            return self.ungetted.pop()
        if self.eof:
            return None
        inbyte = self.infile.read(1)
        if not inbyte:
            self.infile.close()
            # The archive handle only exists for ZIP input; release it as well.
            archive = getattr(self, 'zipfile', None)
            if archive is not None:
                archive.close()
            self.eof = True
            return None
        self.position += 1
        return ord(inbyte)

    def unget(self, byte):
        """Push byte back so that the next call to byte() returns it."""
        self.position -= 1
        self.ungetted.append(byte)
|
|
|
|
def CharacterClass(byte):
    """Classify a byte value as whitespace, delimiter or regular character.

    Returns CHAR_WHITESPACE, CHAR_DELIMITER or CHAR_REGULAR.
    """
    whitespaceValues = (0, 9, 10, 12, 13, 32)
    # ( ) < > [ ] { } / %
    delimiterValues = (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25)
    if byte in whitespaceValues:
        return CHAR_WHITESPACE
    if byte in delimiterValues:
        return CHAR_DELIMITER
    return CHAR_REGULAR
|
|
|
|
def IsNumeric(str):
    """Return a match object when str starts with decimal digits, else None.

    Only the start of the string is tested: text such as '12abc' matches too.
    """
    leadingDigits = re.compile('^[0-9]+')
    return leadingDigits.match(str)
|
|
|
|
class cPDFTokenizer:
    """Split the bytes of a cPDFDocument into (character class, text) tokens."""

    def __init__(self, file):
        """Open file through cPDFDocument and start with no pushed-back tokens."""
        self.oPDF = cPDFDocument(file)
        self.ungetted = []

    def _ReadRun(self, runClass):
        """Collect the run of bytes of class runClass that starts at self.byte."""
        collected = []
        while self.byte is not None and CharacterClass(self.byte) == runClass:
            collected.append(chr(self.byte))
            self.byte = self.oPDF.byte()
        if self.byte is None:
            # End of input: drop the document so later calls return None.
            self.oPDF = None
        else:
            # The byte that ended the run belongs to the next token.
            self.oPDF.unget(self.byte)
        self.token = ''.join(collected)
        return (runClass, self.token)

    def _ReadAngle(self, angle):
        """Return the doubled delimiter when the next byte repeats self.byte.

        angle -- '<' or '>', the text of the byte currently in self.byte.
        """
        opening = self.byte
        self.byte = self.oPDF.byte()
        if self.byte == opening:
            return (CHAR_DELIMITER, angle * 2)
        self.oPDF.unget(self.byte)
        return (CHAR_DELIMITER, angle)

    def _ReadComment(self):
        """Collect a '%' comment up to and including its end-of-line."""
        collected = []
        while self.byte is not None:
            collected.append(chr(self.byte))
            endOfLine = self.byte in (10, 13)
            self.byte = self.oPDF.byte()
            if endOfLine:
                break
        if self.byte is None:
            self.oPDF = None
        elif self.byte == 10:
            # A line feed directly after the end-of-line byte stays in the comment.
            collected.append(chr(self.byte))
        else:
            self.oPDF.unget(self.byte)
        self.token = ''.join(collected)
        return (CHAR_DELIMITER, self.token)

    def Token(self):
        """Return the next token tuple, or None when the input is exhausted.

        Tokens pushed back with unget() are returned first.
        """
        if self.ungetted:
            return self.ungetted.pop()
        if self.oPDF is None:
            return None
        self.byte = self.oPDF.byte()
        if self.byte is None:
            self.oPDF = None
            return None
        byteClass = CharacterClass(self.byte)
        if byteClass == CHAR_WHITESPACE or byteClass == CHAR_REGULAR:
            return self._ReadRun(byteClass)
        if self.byte == 0x3C:
            return self._ReadAngle('<')
        if self.byte == 0x3E:
            return self._ReadAngle('>')
        if self.byte == 0x25:
            return self._ReadComment()
        return (CHAR_DELIMITER, chr(self.byte))

    def TokenIgnoreWhiteSpace(self):
        """Return the next token that is not whitespace, or None at end of input."""
        candidate = self.Token()
        while candidate is not None and candidate[0] == CHAR_WHITESPACE:
            candidate = self.Token()
        return candidate

    def unget(self, byte):
        """Push a token back so that the next call to Token() returns it."""
        self.ungetted.append(byte)
|
|
|
|
def _LeadingInteger(text):
    """Return the decimal value of the digits that text starts with.

    Only called for text accepted by IsNumeric(). Unlike eval(), leading
    zeros are read as decimal and trailing characters are ignored.
    """
    return int(re.match('[0-9]+', text).group(0), 10)


class cPDFParser:
    """Group the tokens of a PDF document into PDF elements."""

    def __init__(self, file, verbose=False, extract=None):
        """Create a parser for file.

        file    -- path or URL, passed on to cPDFTokenizer.
        verbose -- when true, print 'todo' diagnostics for tokens found
                   outside of any element.
        extract -- when set, unexpected content outside of any element is
                   returned as one cPDFElementMalformed.
        """
        self.context = CONTEXT_NONE
        self.content = []
        self.oPDFTokenizer = cPDFTokenizer(file)
        self.verbose = verbose
        self.extract = extract

    def GetObject(self):
        """Return the next PDF element, or None at end of input.

        The element is an instance of cPDFElementComment,
        cPDFElementIndirectObject, cPDFElementTrailer, cPDFElementXref,
        cPDFElementStartxref or cPDFElementMalformed.
        """
        while True:
            # Whitespace is only significant inside an indirect object.
            if self.context == CONTEXT_OBJ:
                self.token = self.oPDFTokenizer.Token()
            else:
                self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if self.token:
                if self.token[0] == CHAR_DELIMITER:
                    if self.token[1][0] == '%':
                        if self.context == CONTEXT_OBJ:
                            self.content.append(self.token)
                        else:
                            return cPDFElementComment(self.token[1])
                    elif self.token[1] == '/':
                        # Merge '/' and the following regular token into one name token.
                        self.token2 = self.oPDFTokenizer.Token()
                        if self.token2 is not None and self.token2[0] == CHAR_REGULAR:
                            if self.context != CONTEXT_NONE:
                                self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
                            elif self.verbose:
                                print('todo 1: %s' % (self.token[1] + self.token2[1]))
                        else:
                            # token2 is None when the input ends right after '/'.
                            if self.token2 is not None:
                                self.oPDFTokenizer.unget(self.token2)
                            if self.context != CONTEXT_NONE:
                                self.content.append(self.token)
                            elif self.verbose:
                                print('todo 2: %d %s' % (self.token[0], repr(self.token[1])))
                    elif self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 3: %d %s' % (self.token[0], repr(self.token[1])))
                elif self.token[0] == CHAR_WHITESPACE:
                    if self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 4: %d %s' % (self.token[0], repr(self.token[1])))
                else:
                    if self.context == CONTEXT_OBJ:
                        if self.token[1] == 'endobj':
                            self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementIndirectObject
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_TRAILER:
                        if self.token[1] == 'startxref' or self.token[1] == 'xref':
                            self.oPDFElementTrailer = cPDFElementTrailer(self.content)
                            # The keyword starts the next element; read it again later.
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementTrailer
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_XREF:
                        if self.token[1] == 'trailer' or self.token[1] == 'xref':
                            self.oPDFElementXref = cPDFElementXref(self.content)
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementXref
                        else:
                            self.content.append(self.token)
                    else:
                        if IsNumeric(self.token[1]):
                            # Look ahead for '<id> <version> obj'.
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if self.token2 is not None and IsNumeric(self.token2[1]):
                                self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                                if self.token3 is not None and self.token3[1] == 'obj':
                                    self.objectId = _LeadingInteger(self.token[1])
                                    self.objectVersion = _LeadingInteger(self.token2[1])
                                    self.context = CONTEXT_OBJ
                                else:
                                    # Tokens are popped in reverse order of being pushed back.
                                    if self.token3 is not None:
                                        self.oPDFTokenizer.unget(self.token3)
                                    self.oPDFTokenizer.unget(self.token2)
                                    if self.verbose:
                                        print('todo 6: %d %s' % (self.token[0], repr(self.token[1])))
                            else:
                                if self.token2 is not None:
                                    self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print('todo 7: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.token[1] == 'trailer':
                            self.context = CONTEXT_TRAILER
                            self.content = [self.token]
                        elif self.token[1] == 'xref':
                            self.context = CONTEXT_XREF
                            self.content = [self.token]
                        elif self.token[1] == 'startxref':
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if self.token2 and IsNumeric(self.token2[1]):
                                return cPDFElementStartxref(_LeadingInteger(self.token2[1]))
                            else:
                                if self.token2 is not None:
                                    self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print('todo 9: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.extract:
                            # Everything from here to the end of input is malformed content.
                            self.bytes = ''
                            while self.token:
                                self.bytes += self.token[1]
                                self.token = self.oPDFTokenizer.Token()
                            return cPDFElementMalformed(self.bytes)
                        elif self.verbose:
                            print('todo 10: %d %s' % (self.token[0], repr(self.token[1])))
            else:
                break
|
|
|
|
class cPDFElementComment:
    """PDF comment element: holds the comment text and its element type."""

    def __init__(self, comment):
        """Store comment and mark the element as PDF_ELEMENT_COMMENT."""
        self.comment = comment
        self.type = PDF_ELEMENT_COMMENT
|
|
|
|
class cPDFElementXref:
    """Cross-reference element: holds the tokens collected for an xref section."""

    def __init__(self, content):
        """Store content and mark the element as PDF_ELEMENT_XREF."""
        self.content = content
        self.type = PDF_ELEMENT_XREF
|
|
|
|
class cPDFElementTrailer:
    """Trailer element: holds the tokens collected for a trailer."""

    def __init__(self, content):
        """Store content and mark the element as PDF_ELEMENT_TRAILER."""
        self.content = content
        self.type = PDF_ELEMENT_TRAILER
|
|
|
|
def IIf(expr, truepart, falsepart):
    """Return truepart when expr is true, otherwise falsepart.

    Both alternatives are evaluated by the caller before the call.
    """
    return truepart if expr else falsepart
|
|
|
|
class cPDFElementIndirectObject:
    """Indirect object element ('<id> <version> obj ... endobj')."""

    def __init__(self, id, version, content):
        """Store the object number, its version and its token list."""
        self.type = PDF_ELEMENT_INDIRECT_OBJECT
        self.id = id
        self.version = version
        self.content = content

    def GetType(self):
        """Return the value that follows /Type in the top-level dictionary.

        Returns '' when no /Type key is found.
        """
        content = CopyWithoutWhiteSpace(self.content)
        depth = 0
        for i, token in enumerate(content):
            if token[0] == CHAR_DELIMITER and token[1] == '<<':
                depth += 1
            if token[0] == CHAR_DELIMITER and token[1] == '>>':
                depth -= 1
            if depth == 1 and token[0] == CHAR_DELIMITER and EqualCanonical(token[1], '/Type') and i < len(content) - 1:
                return content[i + 1][1]
        return ''

    def GetReferences(self):
        """Return a list of (id, version, 'R') text tuples, one per reference."""
        content = CopyWithoutWhiteSpace(self.content)
        references = []
        for i in range(2, len(content)):
            isReference = (content[i][0] == CHAR_REGULAR and content[i][1] == 'R'
                           and content[i - 2][0] == CHAR_REGULAR and IsNumeric(content[i - 2][1])
                           and content[i - 1][0] == CHAR_REGULAR and IsNumeric(content[i - 1][1]))
            if isReference:
                references.append((content[i - 2][1], content[i - 1][1], content[i][1]))
        return references

    def References(self, index):
        """Return True when the object references the object whose id text is index."""
        for ref in self.GetReferences():
            if ref[0] == index:
                return True
        return False

    def ContainsStream(self):
        """Return the tokens that precede the 'stream' keyword, or False.

        Note that the returned list is empty, and therefore false, when
        'stream' is the very first token.
        """
        for i, token in enumerate(self.content):
            if token[0] == CHAR_REGULAR and token[1] == 'stream':
                return self.content[0:i]
        return False

    def Contains(self, keyword):
        """Case-insensitive search for keyword in the tokens before the stream."""
        pieces = []
        for token in self.content:
            if token[1] == 'stream':
                break
            pieces.append(Canonicalize(token[1]))
        return keyword.upper() in ''.join(pieces).upper()

    def StreamContains(self, keyword, filter, casesensitive, regex):
        """Search the stream of this object for keyword.

        keyword       -- text or, when regex is true, regular expression.
        filter        -- search the decoded stream; the raw stream is used
                         instead when the object declares no filters.
        casesensitive -- when false, the comparison ignores case.
        regex         -- treat keyword as a regular expression.

        Returns a true value on a hit and a false value otherwise.
        """
        if not self.ContainsStream():
            return False
        streamData = self.Stream(filter)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False)
        if not isinstance(streamData, str):
            # Stream() hands back its list of filters when the stream is not
            # terminated by 'endstream'; there is no data to search then.
            return False
        if regex:
            return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
        elif casesensitive:
            return keyword in streamData
        else:
            return keyword.lower() in streamData.lower()

    def Stream(self, filter=True):
        """Return the stream data of this object.

        filter -- when true, the data is passed through Decompress().

        Returns the (decoded) data as a string. Decompress() may return an
        error message string instead. When no complete stream is found, the
        list of filter names collected so far is returned ([] for an object
        without stream and without /Filter).
        """
        state = 'start'
        countDirectories = 0
        data = ''
        filters = []
        for token in self.content:
            tokenClass = token[0]
            text = token[1]
            if state == 'start':
                if tokenClass == CHAR_DELIMITER and text == '<<':
                    countDirectories += 1
                if tokenClass == CHAR_DELIMITER and text == '>>':
                    countDirectories -= 1
                if countDirectories == 1 and tokenClass == CHAR_DELIMITER and EqualCanonical(text, '/Filter'):
                    state = 'filter'
                elif countDirectories == 0 and tokenClass == CHAR_REGULAR and text == 'stream':
                    state = 'stream-whitespace'
            elif state == 'filter':
                if tokenClass == CHAR_DELIMITER and text[0] == '/':
                    filters = [text]
                    state = 'search-stream'
                elif tokenClass == CHAR_DELIMITER and text == '[':
                    state = 'filter-list'
            elif state == 'filter-list':
                if tokenClass == CHAR_DELIMITER and text[0] == '/':
                    filters.append(text)
                elif tokenClass == CHAR_DELIMITER and text == ']':
                    state = 'search-stream'
            elif state == 'search-stream':
                if tokenClass == CHAR_REGULAR and text == 'stream':
                    state = 'stream-whitespace'
            elif state == 'stream-whitespace':
                # The whitespace token right after 'stream' is not part of the data.
                if tokenClass != CHAR_WHITESPACE:
                    data += text
                state = 'stream-concat'
            elif state == 'stream-concat':
                if tokenClass == CHAR_REGULAR and text == 'endstream':
                    if filter:
                        return self.Decompress(data, filters)
                    else:
                        return data
                else:
                    data += text
            else:
                return 'Unexpected filter state'
        return filters

    def Decompress(self, data, filters):
        """Apply the decoders named in filters to data, in order.

        Returns the decoded data, 'No filters' when filters is empty, or a
        message string when a decoder fails or a filter is not supported.
        """
        for filterName in filters:
            if EqualCanonical(filterName, '/FlateDecode') or EqualCanonical(filterName, '/Fl'):
                try:
                    data = FlateDecode(data)
                except zlib.error as e:
                    message = 'FlateDecode decompress failed'
                    # The low nibble of the first byte is compared with 8 and
                    # reported as the compression method when it differs.
                    if len(data) > 0 and ord(data[0]) & 0x0F != 8:
                        message += ', unexpected compression method: %02x' % ord(data[0])
                    return message + '. zlib.error %s' % e
            elif EqualCanonical(filterName, '/ASCIIHexDecode') or EqualCanonical(filterName, '/AHx'):
                try:
                    data = ASCIIHexDecode(data)
                except Exception:
                    return 'ASCIIHexDecode decompress failed'
            elif EqualCanonical(filterName, '/ASCII85Decode') or EqualCanonical(filterName, '/A85'):
                try:
                    data = ASCII85Decode(data.rstrip('>'))
                except Exception:
                    return 'ASCII85Decode decompress failed'
            elif EqualCanonical(filterName, '/LZWDecode') or EqualCanonical(filterName, '/LZW'):
                try:
                    data = LZWDecode(data)
                except Exception:
                    return 'LZWDecode decompress failed'
            elif EqualCanonical(filterName, '/RunLengthDecode') or EqualCanonical(filterName, '/R'):
                try:
                    data = RunLengthDecode(data)
                except Exception:
                    return 'RunLengthDecode decompress failed'
            # CCITTFaxDecode (/CC) and DCTDecode (/DCT) are not implemented.
            else:
                return 'Unsupported filter: %s' % repr(filters)
        if len(filters) == 0:
            return 'No filters'
        else:
            return data
|
|
|
|
class cPDFElementStartxref:
    """startxref element: holds the number that follows the keyword."""

    def __init__(self, index):
        """Store index and mark the element as PDF_ELEMENT_STARTXREF."""
        self.index = index
        self.type = PDF_ELEMENT_STARTXREF
|
|
|
|
class cPDFElementMalformed:
    """Malformed element: holds content that fits no other element type."""

    def __init__(self, content):
        """Store content and mark the element as PDF_ELEMENT_MALFORMED."""
        self.content = content
        self.type = PDF_ELEMENT_MALFORMED
|
|
|
|
def TrimLWhiteSpace(data):
    """Return the token list data without its leading whitespace tokens.

    The list itself is returned when there is nothing to trim.
    """
    first = 0
    while first < len(data) and data[first][0] == CHAR_WHITESPACE:
        first += 1
    if first == 0:
        return data
    return data[first:]
|
|
|
|
def TrimRWhiteSpace(data):
    """Return the token list data without its trailing whitespace tokens.

    The list itself is returned when there is nothing to trim.
    """
    end = len(data)
    while end > 0 and data[end - 1][0] == CHAR_WHITESPACE:
        end -= 1
    if end == len(data):
        return data
    return data[:end]
|
|
|
|
class cPDFParseDictionary:
    """Parse the tokens of a PDF dictionary into nested (key, value) lists."""

    def __init__(self, content, nocanonicalizedoutput):
        """Parse content.

        content               -- list of (character class, text) tokens.
        nocanonicalizedoutput -- when true, keys and values are kept as
                                 written instead of being canonicalized.

        self.parsed holds the result; it is None unless the trimmed content
        starts with '<<' and ends with '>>'.
        """
        self.content = content
        self.nocanonicalizedoutput = nocanonicalizedoutput
        dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
        if dataTrimmed == []:
            self.parsed = None
        elif self.isOpenDictionary(dataTrimmed[0]) and self.isCloseDictionary(dataTrimmed[-1]):
            self.parsed = self.ParseDictionary(dataTrimmed)[0]
        else:
            self.parsed = None

    def isOpenDictionary(self, token):
        """Return True when token is the '<<' delimiter."""
        return token[0] == CHAR_DELIMITER and token[1] == '<<'

    def isCloseDictionary(self, token):
        """Return True when token is the '>>' delimiter."""
        return token[0] == CHAR_DELIMITER and token[1] == '>>'

    def ParseDictionary(self, tokens):
        """Parse one dictionary starting at tokens[0].

        Returns (dictionary, remaining tokens). dictionary is a list of
        (key, value) tuples where value is a list of strings or, for a nested
        dictionary, another such list; it is None when tokens does not start
        with '<<'. On a normal return, remaining tokens starts at the closing
        '>>'. When the tokens run out before the dictionary is closed, the
        entries parsed so far are returned with an empty remainder.
        """
        state = 0  # 0: expect '<<', 1: expect a key, 2: collect the value
        dictionary = []
        while tokens != []:
            if state == 0:
                if self.isOpenDictionary(tokens[0]):
                    state = 1
                else:
                    return None, tokens
            elif state == 1:
                if self.isOpenDictionary(tokens[0]):
                    pass
                elif self.isCloseDictionary(tokens[0]):
                    return dictionary, tokens
                elif tokens[0][0] != CHAR_WHITESPACE:
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
            elif state == 2:
                if self.isOpenDictionary(tokens[0]):
                    # The nested call stops at its closing '>>', which is
                    # skipped by the slice at the bottom of the loop.
                    value, tokens = self.ParseDictionary(tokens)
                    dictionary.append((key, value))
                    state = 1
                elif self.isCloseDictionary(tokens[0]):
                    dictionary.append((key, value))
                    return dictionary, tokens
                elif value == [] and tokens[0][0] == CHAR_WHITESPACE:
                    pass
                elif value == [] and tokens[0][1] == '[':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] != ']':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] == ']':
                    value.append(tokens[0][1])
                    dictionary.append((key, value))
                    value = []
                    state = 1
                elif value != [] and tokens[0][1][0] == '/':
                    # A new name after a complete value starts the next key.
                    dictionary.append((key, value))
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
                else:
                    value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
            tokens = tokens[1:]
        # The tokens ran out before the closing '>>' (unbalanced dictionary).
        if state == 0:
            return None, tokens
        if state == 2:
            dictionary.append((key, value))
        return dictionary, tokens

    def retrieve(self):
        """Return the parsed dictionary (None when parsing was not possible)."""
        return self.parsed

    def PrettyPrintSub(self, prefix, dictionary):
        """Print dictionary, one entry per line, each line starting with prefix."""
        if dictionary is not None:
            print('%s<<' % prefix)
            for e in dictionary:
                if e[1] == []:
                    print('%s %s' % (prefix, e[0]))
                elif type(e[1][0]) == type(''):
                    value = ''.join(e[1]).strip()
                    reprValue = repr(value)
                    # Show the repr() form when the value needs escaping.
                    if "'" + value + "'" != reprValue:
                        value = reprValue
                    print('%s %s %s' % (prefix, e[0], value))
                else:
                    # The value is a nested dictionary: print it with a longer prefix.
                    print('%s %s' % (prefix, e[0]))
                    self.PrettyPrintSub(prefix + ' ', e[1])
            print('%s>>' % prefix)

    def PrettyPrint(self, prefix):
        """Print the parsed dictionary, each line starting with prefix."""
        self.PrettyPrintSub(prefix, self.parsed)
|
|
|
|
def FormatOutput(data, raw):
    """Format data for printing.

    raw false -- return repr(data).
    raw true  -- return data unchanged, except that a list of tokens is
                 replaced by the concatenation of the token texts.
    """
    if not raw:
        return repr(data)
    if type(data) is list:
        return ''.join([entry[1] for entry in data])
    return data
|
|
|
|
def PrintObject(object, options):
    """Print a report about one indirect object.

    object  -- cPDFElementIndirectObject to report on.
    options -- parsed command-line options; the attributes read are
               nocanonicalizedoutput, debug, raw, filter, content and dump.

    The report lists id, version, type, references and the parsed
    dictionary. Depending on the options, the stream is printed and/or
    written to the file named by options.dump.
    """
    print('obj %d %d' % (object.id, object.version))
    typeName = ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput)
    print(' Type: %s' % typeName)
    referenceTexts = ['%s %s %s' % reference for reference in object.GetReferences()]
    print(' Referencing: %s' % ', '.join(referenceTexts))

    # For an object with a stream, only the tokens before it form the dictionary.
    precedingTokens = object.ContainsStream()
    if precedingTokens:
        print(' Contains stream')
        showTokens = options.debug
        dictionaryTokens = precedingTokens
    else:
        showTokens = options.debug or options.raw
        dictionaryTokens = object.content
    if showTokens:
        print(' %s' % FormatOutput(dictionaryTokens, options.raw))
    dictionaryParser = cPDFParseDictionary(dictionaryTokens, options.nocanonicalizedoutput)
    print('')
    dictionaryParser.PrettyPrint(' ')
    print('')

    if options.filter and not options.dump:
        decoded = object.Stream()
        # Stream() returns [] when there is nothing to decode.
        shown = object.content if decoded == [] else decoded
        print(' %s' % FormatOutput(shown, options.raw))

    if options.content:
        if object.ContainsStream():
            rawStream = object.Stream(False)
            if rawStream != []:
                print(' %s' % FormatOutput(rawStream, options.raw))
        else:
            print(''.join([token[1] for token in object.content]))

    if options.dump:
        dumpData = object.Stream(options.filter == True)
        if dumpData == []:
            dumpData = ''
        try:
            fDump = open(options.dump, 'wb')
            try:
                fDump.write(C2BIP3(dumpData))
            except:
                print('Error writing file %s' % options.dump)
            fDump.close()
        except:
            print('Error writing file %s' % options.dump)
    print('')
    return
|
|
|
|
def Canonicalize(sIn):
    """Return the PDF name sIn with its #xx hexadecimal escapes decoded.

    Text that does not start with '/' is returned unchanged. A '#' is only
    decoded when it is followed by exactly two hexadecimal digits; any other
    '#' is kept as it is.
    """
    if not sIn.startswith('/'):
        return sIn
    # int(text, 16) also accepts signs and surrounding whitespace, so the two
    # characters are validated by the pattern before being converted.
    return re.sub('#([0-9A-Fa-f]{2})', lambda match: chr(int(match.group(1), 16)), sIn)
|
|
|
|
def EqualCanonical(s1, s2):
    """Return True when the canonical form of s1 equals s2 (s2 is used as is)."""
    canonical = Canonicalize(s1)
    return canonical == s2
|
|
|
|
def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
    """Return sIn canonicalized, unless nocanonicalizedoutput is true."""
    if not nocanonicalizedoutput:
        return Canonicalize(sIn)
    return sIn
|
|
|
|
# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
|
|
def ASCII85Decode(data):
    """Decode ASCII85 text and return the result as a string.

    data -- string of ASCII85 characters. Characters outside '!'..'u' other
            than 'z' and '~' are skipped; decoding stops at the first '~'.

    Every character of the result stands for one decoded byte, on Python 2
    as well as on Python 3. A trailing group that is not closed by '~' is
    dropped.

    Raises ValueError when 'z' appears inside a group and struct.error when
    a group encodes a value that does not fit in 32 bits.
    """
    import struct

    def groupToText(value):
        # struct.pack() returns bytes on Python 3; map every byte to the
        # character with the same ordinal so the result stays a string.
        return ''.join(map(chr, bytearray(struct.pack('>L', value))))

    count = 0
    accumulator = 0
    pieces = []
    for c in data:
        if '!' <= c <= 'u':
            count += 1
            accumulator = accumulator * 85 + (ord(c) - 33)
            if count == 5:
                pieces.append(groupToText(accumulator))
                count = accumulator = 0
        elif c == 'z':
            if count != 0:
                raise ValueError("ASCII85: 'z' inside a group")
            pieces.append('\0\0\0\0')
        elif c == '~':
            if count:
                # Pad the partial group with the highest digit and keep the
                # count - 1 bytes that it really encodes.
                for _ in range(5 - count):
                    accumulator = accumulator * 85 + 84
                pieces.append(groupToText(accumulator)[:count - 1])
            break
    return ''.join(pieces)
|
|
|
|
def ASCIIHexDecode(data):
    """Decode hexadecimal text and return the result as a string.

    Whitespace (the same byte values that CharacterClass() treats as
    whitespace) is ignored and trailing '>' end-of-data markers are removed.
    An odd number of digits is completed with a final '0'. Every character
    of the result stands for one decoded byte, on Python 2 as well as on
    Python 3.

    Characters that are not hexadecimal digits make binascii.unhexlify()
    raise (TypeError on Python 2, binascii.Error on Python 3).
    """
    digits = ''.join([c for c in data if c not in ' \t\n\r\x0c\x00']).rstrip('>')
    if len(digits) % 2:
        digits += '0'
    decoded = binascii.unhexlify(digits)
    # unhexlify() returns bytes on Python 3; keep the string convention used
    # by the other decoders.
    return ''.join(map(chr, bytearray(decoded)))
|
|
|
|
def FlateDecode(data):
    """Decompress zlib data.

    data -- compressed data. On Python 3 a str is accepted as well: each
            character is taken as the byte with the same ordinal and the
            result is returned as a str built the same way. Bytes input
            gives bytes output.

    Raises zlib.error when the data cannot be decompressed.
    """
    if sys.version_info[0] > 2 and isinstance(data, str):
        # latin-1 maps the ordinals 0-255 one-to-one onto byte values.
        return zlib.decompress(data.encode('latin-1')).decode('latin-1')
    return zlib.decompress(data)
|
|
|
|
def RunLengthDecode(data):
    """Decode run-length encoded data and return the result as a string.

    Each run starts with a length byte:
      0..127   -- copy the next length + 1 bytes literally,
      129..255 -- repeat the next byte 257 - length times,
      128      -- end of data.

    Decoding also stops when the data ends without an end-of-data byte; a
    run cut short by the end of the data contributes the bytes available.
    """
    pieces = []
    position = 0
    size = len(data)
    while position < size:
        runLength = ord(data[position])
        position += 1
        if runLength == 128:
            break
        if runLength < 128:
            # A length byte of 0 is a valid run of one literal byte.
            pieces.append(data[position:position + runLength + 1])
            position += runLength + 1
        else:
            pieces.append(data[position:position + 1] * (257 - runLength))
            position += 1
    return ''.join(pieces)
|
|
|
|
#### LZW code sourced from pdfminer
|
|
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
|
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
|
|
class LZWDecoder(object):
    """Incremental LZW decoder that reads variable-width codes from fp."""

    def __init__(self, fp):
        """Prepare to decode from fp, an object whose read(1) returns one character."""
        self.fp = fp
        self.buff = 0
        self.bpos = 8  # 8 means the current byte is used up
        self.nbits = 9
        self.table = None
        self.prevbuf = None

    def readbits(self, bits):
        """Return the next bits-wide code; raise EOFError when fp is exhausted."""
        value = 0
        while True:
            # Number of bits still available in the current byte.
            available = 8 - self.bpos
            if bits <= available:
                # The rest of the code fits in the current byte.
                shift = available - bits
                mask = (1 << bits) - 1
                value = (value << bits) | ((self.buff >> shift) & mask)
                self.bpos += bits
                return value
            # Use up the remaining bits of this byte, then load the next one.
            value = (value << available) | (self.buff & ((1 << available) - 1))
            bits -= available
            nextByte = self.fp.read(1)
            if not nextByte:
                raise EOFError
            self.buff = ord(nextByte)
            self.bpos = 0

    def feed(self, code):
        """Process one code and return the text it decodes to ('' for control codes)."""
        if code == 256:
            # Clear-table code: entries 0-255 are the single characters,
            # 256 and 257 are placeholders for the two control codes.
            self.table = [chr(value) for value in range(256)] + [None, None]
            self.prevbuf = ''
            self.nbits = 9
            return ''
        if code == 257:
            return ''
        if not self.prevbuf:
            self.prevbuf = self.table[code]
            return self.prevbuf
        if code < len(self.table):
            entry = self.table[code]
            self.table.append(self.prevbuf + entry[0])
        else:
            # The code refers to the entry that is being defined right now.
            self.table.append(self.prevbuf + self.prevbuf[0])
            entry = self.table[code]
        # The code width grows when the table reaches these sizes.
        widthBySize = {511: 10, 1023: 11, 2047: 12}
        size = len(self.table)
        if size in widthBySize:
            self.nbits = widthBySize[size]
        self.prevbuf = entry
        return entry

    def run(self):
        """Yield the decoded text for every code until fp is exhausted."""
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                return
            yield self.feed(code)
|
|
|
|
####
|
|
|
|
def LZWDecode(data):
    """Decode LZW compressed data and return the result as a string."""
    decoder = LZWDecoder(StringIO(data))
    return ''.join(decoder.run())
|
|
|
|
def Main():
    """pdf-parser, use it to parse a PDF document

    Parse the command line, read the PDF named by the single positional
    argument (file, ZIP file or URL) and print the selected elements, or the
    element statistics when --stats is given.
    """

    oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__)
    oParser.add_option('-s', '--search', help='string to search in indirect objects (except streams)')
    oParser.add_option('-f', '--filter', action='store_true', default=False, help='pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only)')
    oParser.add_option('-o', '--object', help='id of indirect object to select (version independent)')
    oParser.add_option('-r', '--reference', help='id of indirect object being referenced (version independent)')
    oParser.add_option('-e', '--elements', help='type of elements to select (cxtsi)')
    oParser.add_option('-w', '--raw', action='store_true', default=False, help='raw output for data and filters')
    oParser.add_option('-a', '--stats', action='store_true', default=False, help='display stats for pdf document')
    oParser.add_option('-t', '--type', help='type of indirect object to select')
    oParser.add_option('-v', '--verbose', action='store_true', default=False, help='display malformed PDF elements')
    oParser.add_option('-x', '--extract', help='filename to extract malformed content to')
    oParser.add_option('-H', '--hash', action='store_true', default=False, help='display hash of objects')
    oParser.add_option('-n', '--nocanonicalizedoutput', action='store_true', default=False, help='do not canonicalize the output')
    oParser.add_option('-d', '--dump', help='filename to dump stream content to')
    oParser.add_option('-D', '--debug', action='store_true', default=False, help='display debug info')
    oParser.add_option('-c', '--content', action='store_true', default=False, help='display the content for objects without streams or with streams without filters')
    oParser.add_option('--searchstream', help='string to search in streams')
    oParser.add_option('--unfiltered', action='store_true', default=False, help='search in unfiltered streams')
    oParser.add_option('--casesensitive', action='store_true', default=False, help='case sensitive search in streams')
    oParser.add_option('--regex', action='store_true', default=False, help='use regex to search in streams')
    (options, args) = oParser.parse_args()

    if len(args) != 1:
        oParser.print_help()
        print('')
        print(' %s' % __description__)
        print(' Source code put in the public domain by Didier Stevens, no Copyright')
        print(' Use at your own risk')
        print(' https://DidierStevens.com')
        return

    oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
    cntComment = 0
    cntXref = 0
    cntTrailer = 0
    cntStartXref = 0
    cntIndirectObject = 0
    dicObjectTypes = {}

    selectComment = False
    selectXref = False
    selectTrailer = False
    selectStartXref = False
    selectIndirectObject = False
    if options.elements:
        for c in options.elements:
            if c == 'c':
                selectComment = True
            elif c == 'x':
                selectXref = True
            elif c == 't':
                selectTrailer = True
            elif c == 's':
                selectStartXref = True
            elif c == 'i':
                selectIndirectObject = True
            else:
                print('Error: unknown --elements value %s' % c)
                return
    else:
        selectIndirectObject = True
        # Without any object selection option, every element type is shown.
        if not options.search and not options.object and not options.reference and not options.type and not options.searchstream:
            selectComment = True
            selectXref = True
            selectTrailer = True
            selectStartXref = True

    # The --object value used to be passed to eval(); it is now parsed as a
    # number: decimal first, then with base detection so that a prefixed value
    # such as 0x10 keeps working.
    selectedObjectId = None
    if options.object:
        try:
            selectedObjectId = int(options.object, 10)
        except ValueError:
            try:
                selectedObjectId = int(options.object, 0)
            except ValueError:
                print('Error: invalid --object value %s' % options.object)
                return

    # '-' selects the objects that have no type.
    if options.type == '-':
        optionsType = ''
    else:
        optionsType = options.type

    while True:
        element = oPDFParser.GetObject()
        if element is None:
            break
        if options.stats:
            if element.type == PDF_ELEMENT_COMMENT:
                cntComment += 1
            elif element.type == PDF_ELEMENT_XREF:
                cntXref += 1
            elif element.type == PDF_ELEMENT_TRAILER:
                cntTrailer += 1
            elif element.type == PDF_ELEMENT_STARTXREF:
                cntStartXref += 1
            elif element.type == PDF_ELEMENT_INDIRECT_OBJECT:
                cntIndirectObject += 1
                dicObjectTypes.setdefault(element.GetType(), []).append(element.id)
        else:
            if element.type == PDF_ELEMENT_COMMENT and selectComment:
                print('PDF Comment %s' % FormatOutput(element.comment, options.raw))
                print('')
            elif element.type == PDF_ELEMENT_XREF and selectXref:
                if options.debug:
                    print('xref %s' % FormatOutput(element.content, options.raw))
                else:
                    print('xref')
                print('')
            elif element.type == PDF_ELEMENT_TRAILER and selectTrailer:
                # content[0] is the 'trailer' keyword itself.
                oPDFParseDictionary = cPDFParseDictionary(element.content[1:], options.nocanonicalizedoutput)
                if oPDFParseDictionary.parsed is None:
                    # No dictionary could be parsed: show the tokens instead.
                    print('trailer %s' % FormatOutput(element.content, options.raw))
                else:
                    print('trailer')
                    oPDFParseDictionary.PrettyPrint(' ')
                print('')
            elif element.type == PDF_ELEMENT_STARTXREF and selectStartXref:
                print('startxref %d' % element.index)
                print('')
            elif element.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
                if options.search:
                    if element.Contains(options.search):
                        PrintObject(element, options)
                elif options.object:
                    if element.id == selectedObjectId:
                        PrintObject(element, options)
                elif options.reference:
                    if element.References(options.reference):
                        PrintObject(element, options)
                elif options.type:
                    if EqualCanonical(element.GetType(), optionsType):
                        PrintObject(element, options)
                elif options.hash:
                    print('obj %d %d' % (element.id, element.version))
                    rawContent = FormatOutput(element.content, True)
                    # hashlib needs bytes on Python 3; C2BIP3 is a no-op on Python 2.
                    print(' len: %d md5: %s' % (len(rawContent), hashlib.md5(C2BIP3(rawContent)).hexdigest()))
                    print('')
                elif options.searchstream:
                    if element.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex):
                        PrintObject(element, options)
                else:
                    PrintObject(element, options)
            elif element.type == PDF_ELEMENT_MALFORMED:
                try:
                    fExtract = open(options.extract, 'wb')
                    try:
                        fExtract.write(C2BIP3(element.content))
                    except Exception:
                        print('Error writing file %s' % options.extract)
                    fExtract.close()
                except Exception:
                    print('Error writing file %s' % options.extract)

    if options.stats:
        print('Comment: %s' % cntComment)
        print('XREF: %s' % cntXref)
        print('Trailer: %s' % cntTrailer)
        print('StartXref: %s' % cntStartXref)
        print('Indirect object: %s' % cntIndirectObject)
        # sorted() works on the key view of Python 3 as well as on the list of Python 2.
        for key in sorted(dicObjectTypes):
            print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(['%d' % objectId for objectId in dicObjectTypes[key]])))
|
|
|
|
def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
    """Report when the running Python is outside the supported version range.

    enforceMaximumVersion -- exit when the interpreter is newer than
                             __maximum_python_version__ instead of only
                             printing a notice.
    enforceMinimumVersion -- exit when the interpreter is older than
                             __minimum_python_version__ instead of only
                             printing a notice.

    In all cases the version named in the messages is
    __maximum_python_version__.
    """
    running = sys.version_info[0:3]
    checks = (
        (running > __maximum_python_version__, enforceMaximumVersion),
        (running < __minimum_python_version__, enforceMinimumVersion),
    )
    for outOfRange, enforce in checks:
        if not outOfRange:
            continue
        if enforce:
            print('This program does not work with this version of Python (%d.%d.%d)' % running)
            print('Please use Python version %d.%d.%d' % __maximum_python_version__)
            sys.exit()
        print('This program has not been tested with this version of Python (%d.%d.%d)' % running)
        print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__)
|
|
|
|
# When run as a script: check the interpreter version first (called with its
# default arguments, so it never exits on its own), then run the tool.
if __name__ == '__main__':
    TestPythonVersion()
    Main()
|