mirror of
https://github.com/autistic-symposium/sec-pentesting-toolkit.git
synced 2025-04-25 10:09:08 -04:00
715 lines
29 KiB
Python
Executable File
715 lines
29 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
__description__ = 'Tool to test a PDF file'
|
|
__author__ = 'Didier Stevens'
|
|
__version__ = '0.1.2'
|
|
__date__ = '2013/03/13'
|
|
|
|
"""
|
|
|
|
Tool to test a PDF file
|
|
|
|
Source code put in public domain by Didier Stevens, no Copyright
|
|
https://DidierStevens.com
|
|
Use at your own risk
|
|
|
|
History:
|
|
2009/03/27: start
|
|
2009/03/28: scan option
|
|
2009/03/29: V0.0.2: xml output
|
|
2009/03/31: V0.0.3: /ObjStm suggested by Dion
|
|
2009/04/02: V0.0.4: added ErrorMessage
|
|
2009/04/20: V0.0.5: added Dates
|
|
2009/04/21: V0.0.6: added entropy
|
|
2009/04/22: added disarm
|
|
2009/04/29: finished disarm
|
|
2009/05/13: V0.0.7: added cPDFEOF
|
|
2009/07/24: V0.0.8: added /AcroForm and /RichMedia, simplified %PDF header regex, extra date format (without TZ)
|
|
2009/07/25: added input redirection, option --force
|
|
2009/10/13: V0.0.9: added detection for CVE-2009-3459; added /RichMedia to disarm
|
|
2010/01/11: V0.0.10: relaxed %PDF header checking
|
|
2010/04/28: V0.0.11: added /Launch
|
|
2010/09/21: V0.0.12: fixed cntCharsAfterLastEOF bug; fix by Russell Holloway
|
|
2011/12/29: updated for Python 3, added keyword /EmbeddedFile
|
|
2012/03/03: added PDFiD2JSON; coded by Brandon Dixon
|
|
2013/02/10: V0.1.0: added http/https support; added support for ZIP file with password 'infected'
|
|
2013/03/11: V0.1.1: fixes for Python 3
|
|
2013/03/13: V0.1.2: Added error handling for files; added /XFA
|
|
|
|
Todo:
|
|
- update XML example (entropy, EOF)
|
|
- code review, cleanup
|
|
"""
|
|
|
|
import optparse
|
|
import os
|
|
import re
|
|
import xml.dom.minidom
|
|
import traceback
|
|
import math
|
|
import operator
|
|
import os.path
|
|
import sys
|
|
import json
|
|
import zipfile
|
|
try:
|
|
import urllib2
|
|
urllib23 = urllib2
|
|
except:
|
|
import urllib.request
|
|
urllib23 = urllib.request
|
|
|
|
#Convert 2 Bytes If Python 3
|
|
def C2BIP3(string):
    """Convert 2 Bytes If Python 3.

    On Python 3 return a bytes object built from the code point of every
    character of *string*; on Python 2 hand *string* back unchanged.
    """
    if sys.version_info[0] <= 2:
        return string
    return bytes(map(ord, string))
|
|
|
|
class cBinaryFile:
    """Byte-oriented reader with push-back over stdin, a URL, a ZIP member or a file.

    Bytes are handed out as integers.  Pushed-back bytes live in
    self.ungetted, used as a stack: the LAST element is the next byte that
    will be returned.
    """

    def __init__(self, file):
        """Open the input designated by *file*.

        file -- '' selects standard input; a name starting with http:// or
                https:// is fetched with urlopen; a name ending in .zip is
                opened as an archive whose first member is read with the
                password 'infected'; anything else is opened as a local
                file in binary mode.

        If the input cannot be opened, the error is printed and the process
        exits through sys.exit().
        """
        self.file = file
        lowered = file.lower()
        if file == '':
            # Python 3's sys.stdin is a text stream; its .buffer attribute is
            # the underlying binary stream.  Python 2's sys.stdin has no
            # .buffer and already yields byte strings.
            self.infile = getattr(sys.stdin, 'buffer', sys.stdin)
        elif lowered.startswith('http://') or lowered.startswith('https://'):
            try:
                # the timeout argument is only passed on Python >= 2.6.1
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except urllib23.URLError:
                # URLError is the base class of HTTPError, so connection
                # failures are reported the same way as HTTP error statuses
                self._Abort('Error accessing URL %s' % file)
        elif lowered.endswith('.zip'):
            try:
                self.zipfile = zipfile.ZipFile(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except Exception:
                self._Abort('Error opening file %s' % file)
        else:
            try:
                self.infile = open(file, 'rb')
            except Exception:
                self._Abort('Error opening file %s' % file)
        self.ungetted = []

    def _Abort(self, message):
        """Print *message* and the exception being handled, then exit."""
        print(message)
        print(sys.exc_info()[1])
        sys.exit()

    def byte(self):
        """Return the next byte as an int, or None at end of input.

        Pushed-back bytes are served first.  The underlying stream is closed
        when it reports end of input.
        """
        if self.ungetted:
            return self.ungetted.pop()
        inbyte = self.infile.read(1)
        if not inbyte:
            self.infile.close()
            return None
        return ord(inbyte)

    def bytes(self, size):
        """Return up to *size* bytes as a list of ints, in reading order.

        Pushed-back bytes come first (in the order byte() would return
        them), followed by fresh data from the stream.  The stream is left
        open here even when it is exhausted; byte() closes it when it hits
        end of input, so a later byte() call never touches a closed stream.
        """
        pending = len(self.ungetted)
        if size <= pending:
            keep = pending - size
            # the stack top is the end of the list, so reverse the slice
            result = self.ungetted[keep:]
            result.reverse()
            del self.ungetted[keep:]
            return result
        inbytes = self.infile.read(size - pending)
        if isinstance(inbytes, str):
            fresh = [ord(b) for b in inbytes]
        else:
            # Python 3 bytes objects already iterate as ints
            fresh = list(inbytes)
        result = self.ungetted[::-1] + fresh
        self.ungetted = []
        return result

    def unget(self, byte):
        """Push *byte* back so that it is the next value byte() returns."""
        self.ungetted.append(byte)

    def ungets(self, bytes):
        """Push the sequence *bytes* back so it is read again in its given order.

        The caller's sequence is not modified.
        """
        self.ungetted.extend(reversed(bytes))
|
|
|
|
class cPDFDate:
    """Character-at-a-time recogniser for PDF date strings.

    Recognised shapes, fed one character per parse() call:
      D: + 14 digits                                  e.g. D:20090128132916
      D: + 14 digits + (+|-|Z) + 2 digits ' 2 digits  e.g. D:20090128132916+01'00
      D: + 14 digits + Z                              e.g. D:20090128132916Z

    The last completed date is kept in self.date.
    """

    def __init__(self):
        # 0: idle, 1: 'D' seen, 2: reading the 14 digits, 3: reading the offset
        self.state = 0

    def parse(self, char):
        """Consume *char*; return the date string when one completes, else None."""
        # a 'D' always (re)starts recognition, whatever the current state
        if char == 'D':
            self.state = 1
            return None
        if self.state == 1:
            if char == ':':
                self.state = 2
                self.digits1 = ''
            else:
                self.state = 0
            return None
        if self.state == 2:
            return self._ParseTimestamp(char)
        if self.state == 3:
            return self._ParseOffset(char)
        return None

    def _ParseTimestamp(self, char):
        """Handle *char* while the 14 date/time digits are being collected."""
        if len(self.digits1) < 14:
            if '0' <= char <= '9':
                self.digits1 += char
            else:
                self.state = 0
            return None
        if char in ('+', '-', 'Z'):
            self.state = 3
            self.digits2 = ''
            self.TZ = char
            return None
        self.state = 0
        if '0' <= char <= '9':
            # a 15th digit: this is not a date after all
            return None
        # any other character ends a date that carries no time zone
        self.date = 'D:' + self.digits1
        return self.date

    def _ParseOffset(self, char):
        """Handle *char* while the time-zone offset (HH'mm) is being collected."""
        seen = len(self.digits2)
        if seen == 2:
            # the hours must be followed by an apostrophe
            if char == "'":
                self.digits2 += char
            else:
                self.state = 0
            return None
        if seen < 5:
            if '0' <= char <= '9':
                self.digits2 += char
                if len(self.digits2) == 5:
                    self.state = 0
                    self.date = 'D:' + self.digits1 + self.TZ + self.digits2
                    return self.date
                return None
            self.state = 0
            if seen == 0 and self.TZ == 'Z':
                # a bare 'Z' with no offset digits is a complete UTC date;
                # report it instead of dropping it
                self.date = 'D:' + self.digits1 + self.TZ
                return self.date
            return None
        return None
|
|
|
|
def fEntropy(countByte, countTotal):
    """Return one byte value's contribution to the Shannon entropy, in bits.

    countByte  -- number of occurrences of the byte value
    countTotal -- total number of bytes counted

    The result is -p * log2(p) with p = countByte / countTotal, and 0.0 when
    p is 0 or when nothing was counted at all (countTotal == 0).
    """
    if countTotal == 0:
        # nothing counted: avoid dividing by zero
        return 0.0
    x = float(countByte) / countTotal
    if x > 0:
        return -x * math.log(x, 2)
    return 0.0
|
|
|
|
class cEntropy:
    """Per-byte-value histograms used to compute entropy inside/outside streams."""

    def __init__(self):
        # one counter per possible byte value
        self.allBucket = [0] * 256
        self.streamBucket = [0] * 256

    def add(self, byte, insideStream):
        """Count *byte*; also count it as stream data when *insideStream* is true."""
        self.allBucket[byte] += 1
        if insideStream:
            self.streamBucket[byte] += 1

    def removeInsideStream(self, byte):
        """Take one occurrence of *byte* back out of the stream histogram, if any."""
        if self.streamBucket[byte] > 0:
            self.streamBucket[byte] -= 1

    def _Entropy(self, bucket, total):
        """Return the entropy of histogram *bucket* holding *total* bytes (0.0 if empty)."""
        if total == 0:
            return 0.0
        return sum(fEntropy(count, total) for count in bucket)

    def calc(self):
        """Return (allCount, allEntropy, streamCount, streamEntropy, nonStreamCount, nonStreamEntropy).

        self.nonStreamBucket is (re)built as a real list: it is read twice
        below, which a one-shot iterator would not survive.
        """
        self.nonStreamBucket = [total - inside for total, inside in zip(self.allBucket, self.streamBucket)]
        allCount = sum(self.allBucket)
        streamCount = sum(self.streamBucket)
        nonStreamCount = sum(self.nonStreamBucket)
        return (allCount, self._Entropy(self.allBucket, allCount),
                streamCount, self._Entropy(self.streamBucket, streamCount),
                nonStreamCount, self._Entropy(self.nonStreamBucket, nonStreamCount))
|
|
|
|
class cPDFEOF:
    """Counts %%EOF markers and the characters that follow the last one.

    A marker is counted when %%EOF is followed by a newline, carriage
    return, space or tab.  Characters are fed one at a time to parse().
    """

    def __init__(self):
        self.token = ''                 # part of the marker matched so far
        self.cntEOFs = 0                # number of markers seen
        # initialised here so it can be read even when no marker was found
        self.cntCharsAfterLastEOF = 0

    def parse(self, char):
        """Consume *char* and update the marker and trailing-character counts."""
        if self.cntEOFs > 0:
            self.cntCharsAfterLastEOF += 1
        token = self.token
        if token in ('', '%', '%%', '%%E', '%%EO') and char == '%%EOF'[len(token)]:
            # next expected character of the marker
            self.token = token + char
        elif token == '%%EOF' and char in ('\n', '\r', ' ', '\t'):
            self.cntEOFs += 1
            self.cntCharsAfterLastEOF = 0
            if char == '\n':
                self.token = ''
            else:
                # remember the terminator so that a \r\n pair can be recognised
                self.token = token + char
        elif token == '%%EOF\r':
            if char == '\n':
                # the \n of a \r\n line ending is not counted as trailing data
                self.cntCharsAfterLastEOF = 0
            self.token = ''
        else:
            self.token = ''
|
|
|
|
def FindPDFHeaderRelaxed(oBinaryFile):
    """Look for a %PDF header within the first 1024 bytes of *oBinaryFile*.

    oBinaryFile -- object providing bytes(size) -> list of ints and
                   ungets(list of ints)

    Returns (bytesHeader, pdfHeader):
      * ([], None) when '%PDF' is not found; every byte read is pushed back.
      * otherwise bytesHeader holds all bytes up to the end of the header and
        pdfHeader is the header text starting at '%PDF'; the remaining bytes
        are pushed back.  The header ends at the first LF or CR found among
        the 10 bytes after '%PDF', else 9 bytes after '%PDF', or at the end
        of the data when the input is shorter than that.
    """
    data = oBinaryFile.bytes(1024)
    index = ''.join([chr(value) for value in data]).find('%PDF')
    if index == -1:
        oBinaryFile.ungets(data)
        return ([], None)
    start = index + 4
    # default end when no line break is found; clamped so that a short input
    # cannot be indexed past its end
    endHeader = min(start + 9, len(data))
    for position in range(start, min(start + 10, len(data))):
        if data[position] in (10, 13):
            endHeader = position
            break
    oBinaryFile.ungets(data[endHeader:])
    return (data[0:endHeader], ''.join([chr(value) for value in data[index:endHeader]]))
|
|
|
|
def Hexcode2String(char):
    """Render one element of a name: an int becomes '#xx' (two hex digits), anything else is returned as is."""
    return '#%02x' % char if type(char) is int else char
|
|
|
|
def SwapCase(char):
    """Swap the case of one element of a name.

    An int is treated as a character code and the code of the case-swapped
    character is returned; any other value has its swapcase() returned.
    """
    if type(char) is not int:
        return char.swapcase()
    flipped = chr(char).swapcase()
    return ord(flipped)
|
|
|
|
def HexcodeName2String(hexcodeName):
    """Join the elements of *hexcodeName* into a string, rendering each one with Hexcode2String."""
    pieces = [Hexcode2String(element) for element in hexcodeName]
    return ''.join(pieces)
|
|
|
|
def SwapName(wordExact):
    """Return a list with SwapCase applied to every element of *wordExact*.

    A real list is returned on every Python version, so the result can be
    iterated more than once.
    """
    return [SwapCase(element) for element in wordExact]
|
|
|
|
def UpdateWords(word, wordExact, slash, words, hexcode, allNames, lastName, insideStream, oEntropy, fOut):
    """Account for a completed word and return the reset scanner state.

    word         -- decoded text of the word ('' when there is nothing to record)
    wordExact    -- elements of the word as read: characters, or ints for
                    hex-escaped characters
    slash        -- '/' when the word is a name, '' otherwise
    words        -- dict mapping a keyword/name to [count, hexcode count];
                    updated in place
    hexcode      -- true when the word contained a hex escape
    allNames     -- true to start counting names that are not yet in *words*
    lastName     -- most recent name seen so far
    insideStream -- whether the scanner is currently between stream/endstream
    oEntropy     -- entropy collector or None
    fOut         -- output file of the disarmed copy or None

    Returns ('', [], False, lastName, insideStream): empty word state plus
    the updated last name and stream flag.
    """
    if word == '':
        return ('', [], False, lastName, insideStream)

    token = slash + word
    isName = slash == '/'

    if token in words:
        counters = words[token]
        counters[0] += 1
        if hexcode:
            counters[1] += 1
    elif isName and allNames:
        words[token] = [1, 1 if hexcode else 0]

    if isName:
        lastName = token
    elif slash == '':
        if word == 'stream':
            insideStream = True
        if word == 'endstream':
            if insideStream == True and oEntropy != None:
                # the keyword's own letters were counted as stream data; undo that
                for letter in 'endstream':
                    oEntropy.removeInsideStream(ord(letter))
            insideStream = False

    if fOut != None:
        exactText = HexcodeName2String(wordExact)
        if isName and token in ('/JS', '/JavaScript', '/AA', '/OpenAction', '/JBIG2Decode', '/RichMedia', '/Launch'):
            # disarm: write the name with swapped case
            swappedText = HexcodeName2String(SwapName(wordExact))
            fOut.write(C2BIP3(swappedText))
            print('/%s -> /%s' % (exactText, swappedText))
        else:
            fOut.write(C2BIP3(exactText))

    return ('', [], False, lastName, insideStream)
|
|
|
|
class cCVE_2009_3459:
    """Counts /Colors values that are suspiciously large (CVE-2009-3459 indicator)."""

    def __init__(self):
        self.count = 0

    def Check(self, lastName, word):
        """Increment self.count when *word* is a number above 2**24 following /Colors.

        lastName -- most recent name seen by the scanner
        word     -- the word that has just been completed
        """
        # decided to alert when the number of colors is expressed with more
        # than 3 bytes; note that ** is exponentiation whereas ^ is XOR
        if lastName == '/Colors' and word.isdigit() and int(word) > 2 ** 24:
            self.count += 1
|
|
|
|
def _AddAttribute(xmlDoc, element, name, value):
    """Attach attribute *name* holding *value* to *element* and return the attribute node."""
    att = xmlDoc.createAttribute(name)
    att.nodeValue = value
    element.setAttributeNode(att)
    return att


def _AddKeywordElement(xmlDoc, eleKeywords, name, count, hexcodeCount):
    """Append one Keyword element (Name, Count, HexcodeCount) to *eleKeywords*."""
    eleKeyword = xmlDoc.createElement('Keyword')
    eleKeywords.appendChild(eleKeyword)
    _AddAttribute(xmlDoc, eleKeyword, 'Name', name)
    _AddAttribute(xmlDoc, eleKeyword, 'Count', str(count))
    _AddAttribute(xmlDoc, eleKeyword, 'HexcodeCount', str(hexcodeCount))


def _IsWordChar(char):
    """Tell whether *char* is an ASCII letter or digit."""
    return 'A' <= char <= 'Z' or 'a' <= char <= 'z' or '0' <= char <= '9'


def _IsHexDigit(byte):
    """Tell whether the byte value *byte* is the code of an ASCII hexadecimal digit."""
    return chr(byte) in '0123456789ABCDEFabcdef'


def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
    """Scan *file* for PDF keywords and return the findings as an XML document.

    file      -- input designator handed to cBinaryFile ('' for stdin, a
                 http(s) URL, a .zip archive or a file name)
    allNames  -- also count every /Name that is not in the fixed keyword list
    extraData -- also collect dates, entropy and %%EOF statistics
    disarm    -- write a copy named <path>.disarmed<ext> in which the names
                 /JS, /JavaScript, /AA, /OpenAction, /JBIG2Decode, /RichMedia
                 and /Launch have the case of their letters swapped
    force     -- scan the input even when no %PDF header is found

    Returns an xml.dom.minidom Document whose root element is PDFiD.  When
    no header is found and *force* is false, the document is returned early
    with IsPDF="False" and without Keywords/Dates elements.  An exception
    raised while scanning is reported through the ErrorOccured and
    ErrorMessage attributes rather than propagated.

    Example of XML output (older format, not all attributes shown):
    <PDFiD ErrorOccured="False" ErrorMessage="" Filename="test.pdf" Header="%PDF-1.1" IsPDF="True" Version="0.0.4" Entropy="4.28">
            <Keywords>
                    <Keyword Count="7" HexcodeCount="0" Name="obj"/>
                    <Keyword Count="7" HexcodeCount="0" Name="endobj"/>
                    <Keyword Count="1" HexcodeCount="0" Name="stream"/>
                    <Keyword Count="1" HexcodeCount="0" Name="endstream"/>
                    <Keyword Count="1" HexcodeCount="0" Name="xref"/>
                    <Keyword Count="1" HexcodeCount="0" Name="trailer"/>
                    <Keyword Count="1" HexcodeCount="0" Name="startxref"/>
                    <Keyword Count="1" HexcodeCount="0" Name="/Page"/>
                    <Keyword Count="0" HexcodeCount="0" Name="/Encrypt"/>
                    <Keyword Count="1" HexcodeCount="0" Name="/JS"/>
                    <Keyword Count="1" HexcodeCount="0" Name="/JavaScript"/>
                    <Keyword Count="0" HexcodeCount="0" Name="/AA"/>
                    <Keyword Count="1" HexcodeCount="0" Name="/OpenAction"/>
                    <Keyword Count="0" HexcodeCount="0" Name="/JBIG2Decode"/>
            </Keywords>
            <Dates>
                    <Date Value="D:20090128132916+01'00" Name="/ModDate"/>
            </Dates>
    </PDFiD>
    """

    word = ''
    wordExact = []
    hexcode = False
    lastName = ''
    insideStream = False
    keywords = ('obj',
                'endobj',
                'stream',
                'endstream',
                'xref',
                'trailer',
                'startxref',
                '/Page',
                '/Encrypt',
                '/ObjStm',
                '/JS',
                '/JavaScript',
                '/AA',
                '/OpenAction',
                '/AcroForm',
                '/JBIG2Decode',
                '/RichMedia',
                '/Launch',
                '/EmbeddedFile',
                '/XFA',
               )
    words = {}
    dates = []
    for keyword in keywords:
        words[keyword] = [0, 0]
    slash = ''

    xmlDoc = xml.dom.minidom.getDOMImplementation().createDocument(None, 'PDFiD', None)
    root = xmlDoc.documentElement
    _AddAttribute(xmlDoc, root, 'Version', __version__)
    _AddAttribute(xmlDoc, root, 'Filename', file)
    attErrorOccured = _AddAttribute(xmlDoc, root, 'ErrorOccured', 'False')
    attErrorMessage = _AddAttribute(xmlDoc, root, 'ErrorMessage', '')
    attIsPDF = _AddAttribute(xmlDoc, root, 'IsPDF', '')

    oPDFDate = None
    oEntropy = None
    oPDFEOF = None
    oCVE_2009_3459 = cCVE_2009_3459()
    # bound before the try block so the cleanup below can always test it
    fOut = None
    try:
        oBinaryFile = cBinaryFile(file)
        if extraData:
            oPDFDate = cPDFDate()
            oEntropy = cEntropy()
            oPDFEOF = cPDFEOF()
        (bytesHeader, pdfHeader) = FindPDFHeaderRelaxed(oBinaryFile)
        if disarm:
            (pathfile, extension) = os.path.splitext(file)
            fOut = open(pathfile + '.disarmed' + extension, 'wb')
            for byteHeader in bytesHeader:
                fOut.write(C2BIP3(chr(byteHeader)))
        if oEntropy is not None:
            for byteHeader in bytesHeader:
                oEntropy.add(byteHeader, insideStream)

        if pdfHeader is None and not force:
            attIsPDF.nodeValue = 'False'
            # the finally clause below closes fOut on this early exit too
            return xmlDoc

        if pdfHeader is None:
            attIsPDF.nodeValue = 'False'
            pdfHeader = ''
        else:
            attIsPDF.nodeValue = 'True'
        _AddAttribute(xmlDoc, root, 'Header', repr(pdfHeader[0:10]).strip("'"))

        byte = oBinaryFile.byte()
        while byte is not None:
            char = chr(byte)
            # ASCII-only test: a case-converting test would let some
            # non-ASCII bytes count as word characters on Python 3
            if _IsWordChar(char):
                word += char
                wordExact.append(char)
            elif slash == '/' and char == '#':
                # inside a name, '#' may introduce a two-digit hex escape
                d1 = oBinaryFile.byte()
                d2 = None
                if d1 is not None:
                    d2 = oBinaryFile.byte()
                if d1 is not None and d2 is not None and _IsHexDigit(d1) and _IsHexDigit(d2):
                    decoded = int(chr(d1) + chr(d2), 16)
                    word += chr(decoded)
                    wordExact.append(decoded)
                    hexcode = True
                    if oEntropy is not None:
                        oEntropy.add(d1, insideStream)
                        oEntropy.add(d2, insideStream)
                    if oPDFEOF is not None:
                        # the EOF parser compares against characters, not ints
                        oPDFEOF.parse(chr(d1))
                        oPDFEOF.parse(chr(d2))
                else:
                    # not an escape: push back what was read (a None marks
                    # end of input and ends the loop when it is read again)
                    if d1 is not None:
                        oBinaryFile.unget(d2)
                    oBinaryFile.unget(d1)
                    (word, wordExact, hexcode, lastName, insideStream) = UpdateWords(word, wordExact, slash, words, hexcode, allNames, lastName, insideStream, oEntropy, fOut)
                    if fOut is not None:
                        fOut.write(C2BIP3(char))
            else:
                oCVE_2009_3459.Check(lastName, word)

                (word, wordExact, hexcode, lastName, insideStream) = UpdateWords(word, wordExact, slash, words, hexcode, allNames, lastName, insideStream, oEntropy, fOut)
                if char == '/':
                    slash = '/'
                else:
                    slash = ''
                if fOut is not None:
                    fOut.write(C2BIP3(char))

            if oPDFDate is not None and oPDFDate.parse(char) is not None:
                dates.append([oPDFDate.date, lastName])

            if oEntropy is not None:
                oEntropy.add(byte, insideStream)

            if oPDFEOF is not None:
                oPDFEOF.parse(char)

            byte = oBinaryFile.byte()
        (word, wordExact, hexcode, lastName, insideStream) = UpdateWords(word, wordExact, slash, words, hexcode, allNames, lastName, insideStream, oEntropy, fOut)

        # check to see if file ended with %%EOF. If so, we can reset charsAfterLastEOF and add one to EOF count. This is never performed in
        # the parse function because it never gets called due to hitting the end of file.
        if oPDFEOF is not None and oPDFEOF.token == '%%EOF':
            oPDFEOF.cntEOFs += 1
            oPDFEOF.cntCharsAfterLastEOF = 0
            oPDFEOF.token = ''

    except SystemExit:
        sys.exit()
    except Exception:
        attErrorOccured.nodeValue = 'True'
        attErrorMessage.nodeValue = traceback.format_exc()
    finally:
        if fOut is not None:
            fOut.close()

    if oEntropy is not None:
        (countAll, entropyAll, countStream, entropyStream, countNonStream, entropyNonStream) = oEntropy.calc()
        entropyReport = (('TotalEntropy', '%f' % entropyAll),
                         ('TotalCount', '%d' % countAll),
                         ('StreamEntropy', '%f' % entropyStream),
                         ('StreamCount', '%d' % countStream),
                         ('NonStreamEntropy', '%f' % entropyNonStream),
                         ('NonStreamCount', '%d' % countNonStream))
    else:
        entropyReport = (('TotalEntropy', ''),
                         ('TotalCount', ''),
                         ('StreamEntropy', ''),
                         ('StreamCount', ''),
                         ('NonStreamEntropy', ''),
                         ('NonStreamCount', ''))
    for (name, value) in entropyReport:
        _AddAttribute(xmlDoc, root, name, value)

    if oPDFEOF is not None:
        countEOF = '%d' % oPDFEOF.cntEOFs
        # guards against the attribute not having been assigned yet when no
        # %%EOF marker was seen
        countCharsAfterLastEOF = '%d' % getattr(oPDFEOF, 'cntCharsAfterLastEOF', 0)
    else:
        countEOF = ''
        countCharsAfterLastEOF = ''
    _AddAttribute(xmlDoc, root, 'CountEOF', countEOF)
    _AddAttribute(xmlDoc, root, 'CountCharsAfterLastEOF', countCharsAfterLastEOF)

    eleKeywords = xmlDoc.createElement('Keywords')
    root.appendChild(eleKeywords)
    for keyword in keywords:
        _AddKeywordElement(xmlDoc, eleKeywords, keyword, words[keyword][0], words[keyword][1])
    _AddKeywordElement(xmlDoc, eleKeywords, '/Colors > 2^24', oCVE_2009_3459.count, 0)
    if allNames:
        for name in sorted(words.keys()):
            if not name in keywords:
                _AddKeywordElement(xmlDoc, eleKeywords, name, words[name][0], words[name][1])

    eleDates = xmlDoc.createElement('Dates')
    root.appendChild(eleDates)
    dates.sort(key=lambda x: x[0])
    for date in dates:
        eleDate = xmlDoc.createElement('Date')
        eleDates.appendChild(eleDate)
        _AddAttribute(xmlDoc, eleDate, 'Value', date[0])
        _AddAttribute(xmlDoc, eleDate, 'Name', date[1])
    return xmlDoc
|
|
|
|
def PDFiD2String(xmlDoc, force):
    """Render the XML document produced by PDFiD as a plain-text report.

    xmlDoc -- document whose root element carries the report attributes
    force  -- when true, the keyword table is printed even if IsPDF is 'False'

    Returns the report as one string.  Only the title line plus an error or
    'Not a PDF document' line is returned when the document records an error
    or (without *force*) a non-PDF input.
    """
    root = xmlDoc.documentElement
    read = root.getAttribute
    parts = ['PDFiD %s %s\n' % (read('Version'), read('Filename'))]

    if read('ErrorOccured') == 'True':
        parts.append('***Error occured***\n%s\n' % read('ErrorMessage'))
        return ''.join(parts)
    if not force and read('IsPDF') == 'False':
        parts.append(' Not a PDF document\n')
        return ''.join(parts)

    parts.append(' PDF Header: %s\n' % read('Header'))

    # one line per keyword; the hexcode count is appended only when non-zero
    for node in root.getElementsByTagName('Keywords')[0].childNodes:
        line = ' %-16s %7d' % (node.getAttribute('Name'), int(node.getAttribute('Count')))
        hexcodeCount = int(node.getAttribute('HexcodeCount'))
        if hexcodeCount > 0:
            line += '(%d)' % hexcodeCount
        parts.append(line + '\n')

    # optional counters: present only when extra data was collected
    for (label, attributeName) in (('%%EOF', 'CountEOF'), ('After last %%EOF', 'CountCharsAfterLastEOF')):
        if read(attributeName) != '':
            parts.append(' %-16s %7d\n' % (label, int(read(attributeName))))

    for node in root.getElementsByTagName('Dates')[0].childNodes:
        parts.append(' %-23s %s\n' % (node.getAttribute('Value'), node.getAttribute('Name')))

    entropyLines = ((' Total entropy: %s (%10s bytes)\n', 'TotalEntropy', 'TotalCount'),
                    (' Entropy inside streams: %s (%10s bytes)\n', 'StreamEntropy', 'StreamCount'),
                    (' Entropy outside streams: %s (%10s bytes)\n', 'NonStreamEntropy', 'NonStreamCount'))
    for (template, entropyName, countName) in entropyLines:
        if read(entropyName) != '':
            parts.append(template % (read(entropyName), read(countName)))

    return ''.join(parts)
|
|
|
|
def Scan(directory, allNames, extraData, disarm, force):
    """Analyse *directory* recursively, or the single file it designates.

    Every file's report is printed and appended to PDFiD.log in the current
    directory.  allNames, extraData, disarm and force are passed on to
    PDFiD.  Scanning is best effort: a failure on one entry is reported on
    stderr and the scan continues, while Ctrl-C (KeyboardInterrupt) is not
    intercepted and stops the scan.
    """
    try:
        if os.path.isdir(directory):
            for entry in os.listdir(directory):
                Scan(os.path.join(directory, entry), allNames, extraData, disarm, force)
        else:
            result = PDFiD2String(PDFiD(directory, allNames, extraData, disarm, force), force)
            print(result)
            # the with block closes the log even if the write fails
            with open('PDFiD.log', 'a') as logfile:
                logfile.write(result + '\n')
    except SystemExit:
        # cBinaryFile prints its own message and calls sys.exit() when an
        # input cannot be opened; keep scanning the remaining entries
        pass
    except Exception:
        sys.stderr.write('Error scanning %s: %s\n' % (directory, sys.exc_info()[1]))
|
|
|
|
#function derived from: http://blog.9bplus.com/pdfidpy-output-to-json
|
|
def PDFiD2JSON(xmlDoc, force):
    """Render the XML document produced by PDFiD as a JSON string.

    xmlDoc -- document whose root element carries the report attributes
    force  -- accepted for symmetry with PDFiD2String; not used here

    Returns the JSON text of a one-element list [{'pdfid': {...}}].  Root
    attributes are copied as strings; keyword counts are integers.  A
    document without Keywords or Dates elements (as returned by PDFiD for a
    non-PDF input) yields empty keyword/date lists.
    """
    root = xmlDoc.documentElement
    read = root.getAttribute

    def elementChildren(tagName):
        # child elements of the first *tagName* element, [] when it is absent
        sections = root.getElementsByTagName(tagName)
        if not sections:
            return []
        return [node for node in sections[0].childNodes if node.nodeType == node.ELEMENT_NODE]

    #grab all keywords
    keywords = []
    for node in elementChildren('Keywords'):
        hexCount = int(node.getAttribute('HexcodeCount'))
        keywords.append({'count': int(node.getAttribute('Count')),
                         'hexcodecount': max(hexCount, 0),
                         'name': node.getAttribute('Name')})

    #grab all date information
    dates = []
    for node in elementChildren('Dates'):
        dates.append({'name': node.getAttribute('Name'), 'value': node.getAttribute('Value')})

    # key spelling and order are part of the output format; keep them as is
    data = {'countEof': read('CountEOF'),
            'countChatAfterLastEof': read('CountCharsAfterLastEOF'),
            'totalEntropy': read('TotalEntropy'),
            'streamEntropy': read('StreamEntropy'),
            'nonStreamEntropy': read('NonStreamEntropy'),
            'errorOccured': read('ErrorOccured'),
            'errorMessage': read('ErrorMessage'),
            'filename': read('Filename'),
            'header': read('Header'),
            'isPdf': read('IsPDF'),
            'version': read('Version'),
            'entropy': read('Entropy'),
            'keywords': {'keyword': keywords},
            'dates': {'date': dates}}
    return json.dumps([{'pdfid': data}])
|
|
|
|
def Main():
    """Command-line entry point.

    No argument: analyse standard input (disarm is switched off, with a
    notice).  One argument: analyse it, or scan it recursively with --scan.
    More than one argument: print the help text and the notice lines.
    """
    usage = 'usage: %prog [options] [pdf-file|zip-file|url]\n' + __description__
    oParser = optparse.OptionParser(usage=usage, version='%prog ' + __version__)
    flags = (('-s', '--scan', 'scan the given directory'),
             ('-a', '--all', 'display all the names'),
             ('-e', '--extra', 'display extra data, like dates'),
             ('-f', '--force', 'force the scan of the file, even without proper %PDF header'),
             ('-d', '--disarm', 'disable JavaScript and auto launch'))
    for (shortFlag, longFlag, helpText) in flags:
        oParser.add_option(shortFlag, longFlag, action='store_true', default=False, help=helpText)
    (options, args) = oParser.parse_args()

    if len(args) > 1:
        oParser.print_help()
        print('')
        print(' %s' % __description__)
        print(' Source code put in the public domain by Didier Stevens, no Copyright')
        print(' Use at your own risk')
        print(' https://DidierStevens.com')
        return

    if len(args) == 1 and options.scan:
        Scan(args[0], options.all, options.extra, options.disarm, options.force)
        return

    if len(args) == 0:
        # '' makes PDFiD read from standard input
        target = ''
        if options.disarm:
            print('Option disarm not supported with stdin')
            options.disarm = False
    else:
        target = args[0]
    print(PDFiD2String(PDFiD(target, options.all, options.extra, options.disarm, options.force), options.force))
|
|
|
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    Main()
|