#!/usr/bin/env python3
"""Scrape crt.sh for unexpired certificates issued to .onion hostnames.

Fetches the crt.sh HTML search results, extracts every Subject CN / SAN
that looks like a Tor onion address, and prints each unique address once
as a Markdown bullet with the certificate dates and issuing CA.
"""
import re
import csv
import pprint

import requests
from bs4 import BeautifulSoup

# crt.sh search: every unexpired cert whose dNSName ILIKE-matches %.onion
# (%25 is the URL-encoded '%' wildcard).
URL = 'https://crt.sh/?dNSName=%25.onion&exclude=expired&match=ILIKE'

# Onion addresses are base32 ([2-7a-z]): v2 = 16 chars, v3 = 56 chars
# (16 + optional 40), followed by ".onion" at a word/field boundary.
ONION_RE = re.compile(r'[2-7a-z]{16}(?:[2-7a-z]{40})?\.onion(\s|$)')

# Column order of the crt.sh results table:
# crt.sh ID, logged at, not before, not after, common name, SANs, issuer.
FIELD_KEYS = ('id', 'at', 'nb', 'na', 'cn', 'san', 'in')


def fetch_cert_rows(url=URL):
    """Fetch the crt.sh results page and return one dict per certificate row.

    Each dict has the keys in FIELD_KEYS.  Header rows (which contain <th>
    cells, so yield no <td> fields) and any row with fewer than 7 cells are
    skipped — the original `if not fields` guard let short rows through and
    crashed on fields[3..6].

    Raises:
        RuntimeError: on a non-200 HTTP response.
    """
    session = requests.Session()
    response = session.get(url)
    if response.status_code != 200:
        raise RuntimeError('http status: {}'.format(response.status_code))

    soup = BeautifulSoup(response.text, 'html.parser')
    # The third <table> on the page holds the search results — assumes
    # crt.sh's page layout; verify if the site markup changes.
    table = soup.find_all('table')[2]

    rows = []
    for tr in table.find_all('tr'):
        # <br> separates multiple SANs inside one cell; replace with spaces
        # so .text produces a whitespace-separated list we can .split().
        for br in tr.find_all('br'):
            br.replace_with(' ')
        fields = [td.text for td in tr.find_all('td')]
        if len(fields) < 7:
            continue  # header row or malformed row
        rows.append(dict(zip(FIELD_KEYS, fields)))
    return rows


def print_onions(rows):
    """Print each unique .onion name from *rows* as a Markdown bullet.

    Wildcard names (``*.example.onion``) are printed as plain code spans;
    concrete names are rendered as links.  Duplicates across certificates
    are printed only once (first occurrence wins).
    """
    seen = set()
    for r in rows:
        dates = '{0} not_before={1} not_after={2}'.format(r['at'], r['nb'], r['na'])
        # The issuer field is a comma-separated DN; pull out the CN component
        # for a short CA name.  Fall back to the whole issuer string if the
        # DN has no CN= part (the original code raised IndexError here).
        dn_parts = [x.strip() for x in r['in'].lower().split(',')]
        cns = [x[3:] for x in dn_parts if x.startswith('cn=')]
        ca = cns[0] if cns else r['in'].lower().strip()

        sans = r['cn'].split() + r['san'].split()
        for san in sans:
            if not ONION_RE.search(san):
                continue
            if san in seen:
                continue
            seen.add(san)
            if san.startswith('*'):
                # Wildcard names are not resolvable, so no link.
                print('* `{san}`\n * {dates} **{ca}**'.format(san=san, dates=dates, ca=ca))
            else:
                print('* [`{san}`](https://{san})\n * {dates} **{ca}**'.format(san=san, dates=dates, ca=ca))


def main():
    """Entry point: fetch, parse, print."""
    print_onions(fetch_cert_rows())


if __name__ == '__main__':
    main()