#!/usr/bin/env python3 from datetime import datetime, timezone from multiprocessing import Pool, Lock import csv import datetime as dt import re import sqlite3 import subprocess import sys import time GLOBAL_DB = None # has to be a global because pickling :-( MASTER_CSV = 'master.csv' SECUREDROP_CSV = 'securedrop-api.csv' DB_FILENAME = 'fetch.sqlite3' SOCKS_PROXY = 'socks5h://127.0.0.1:9050/' USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' BADNESS = 900 CURL_TIMEOUT = 120 RETRY_SLEEP = 60 RETRY_LIMIT = 6 PLACEHOLDER = '-' POOL_WORKERS = 10 DETECTOR_HISTORY=14 TRUE_STRING = 'TRUE' FOOTNOTES = 'Footnotes' DEFERRED_CATEGORIES = ( # stuff to push down the page due to size 'Globaleaks', 'SecureDrop', ) EMOJI_HTTP = ':small_red_triangle: **HTTP**' EMOJI_HTTPS = ':closed_lock_with_key: **HTTPS**' EMOJI_UNSET = ':question:' EMOJI_2xx = ':white_check_mark:' EMOJI_3xx = ':eight_spoked_asterisk:' EMOJI_4xx = ':no_entry_sign:' EMOJI_5xx = ':stop_sign:' EMOJI_BAD_CERT = ':key:' EMOJI_DEAD = ':sos:' EMOJI_NO_CONN = ':exclamation:' EMOJI_NO_DATA = ':new:' EMOJI_NO_DESC = ':question:' EMOJI_CONN_TIMEOUT = ':alarm_clock:' EMOJI_TTL_TIMEOUT = ':timer_clock:' H1 = '#' H2 = '##' H3 = '###' H4 = '####' B = '*' BB = ' *' BBB = ' *' LINE = '----' INDEXJUMP = ':arrow_up: [return to top index](#index)' SCHEMA_SQL = ''' PRAGMA journal_mode = wal; PRAGMA foreign_keys = ON; PRAGMA encoding = "UTF-8"; BEGIN TRANSACTION; CREATE TABLE IF NOT EXISTS fetches ( id INTEGER PRIMARY KEY NOT NULL, ctime INTEGER DEFAULT (CAST(strftime('%s','now') AS INTEGER)) NOT NULL, run TEXT NOT NULL, url TEXT NOT NULL, attempt INTEGER NOT NULL, http_code INTEGER NOT NULL, curl_exit INTEGER NOT NULL, out TEXT NOT NULL, err TEXT NOT NULL ); PRAGMA user_version = 1; COMMIT; ''' INSERT_SQL = ''' INSERT INTO fetches (run, url, attempt, out, err, http_code, curl_exit) VALUES (:run, :url, :attempt, :out, :err, :http_code, :curl_exit) ''' SUMMARY_SQL = ''' SELECT foo.ctime, foo.attempt, foo.http_code, foo.curl_exit, foo.err FROM fetches foo INNER JOIN ( SELECT url, run, MAX(attempt) AS pivot FROM fetches WHERE url = :url GROUP BY url, run ) bar ON foo.url = bar.url AND foo.run = bar.run AND foo.attempt = bar.pivot ORDER BY ctime DESC LIMIT :limit ''' TRASH_SQL = ''' DELETE FROM fetches WHERE ctime < (CAST(strftime('%s', (SELECT DATETIME('now', '-30 day'))) AS INTEGER)); ''' def extract_hcode(s): # static if s == None: return BADNESS + 1 lines = s.splitlines() if len(lines) == 0: return BADNESS + 2 fields = lines[0].split() if len(fields) < 2: return BADNESS + 3 try: code = int(fields[1]) except: code = BADNESS + 4 return code def placeholder(s): if s == '': return PLACEHOLDER if s == None: return PLACEHOLDER return s def unicode_cleanup(x): x = placeholder(x) # canonicalise blanks and None if isinstance(x, str): # native python3 utf-8 string result = x else: # is byte array result = x.decode('utf-8', 'ignore') return result class Database: def __init__(self, filename): self.connection = sqlite3.connect(filename) self.connection.text_factory = lambda x: unicode_cleanup(x) self.cursor = self.connection.cursor() self.cursor.executescript(SCHEMA_SQL) self.now = time.strftime('%Y%m%d%H%M%S', time.gmtime()) self.lock = Lock() def commit(self): self.connection.commit() def close(self): self.commit() self.connection.close() def summary(self, url, limit=DETECTOR_HISTORY): params = { 'url': url, 'limit': limit } rows = self.cursor.execute(SUMMARY_SQL, params) return rows.fetchall() def insert(self, rowhash): rowhash['run'] = self.now self.lock.acquire() # BEGIN PRIVILEGED CODE self.cursor.execute(INSERT_SQL, rowhash) self.commit() self.lock.release() # END PRIVILEGED CODE def trash(self): self.lock.acquire() # BEGIN PRIVILEGED CODE result = self.cursor.execute(TRASH_SQL) self.commit() self.lock.release() # END PRIVILEGED CODE return result.fetchall() class URL: def __init__(self, url): if not (url.startswith('http://') or url.startswith('https://')): raise RuntimeError('not a proper url: ' + url) self.url = url self.attempt = 0 self.last_code = None def fetch1(self): args = [ 'curl', '--head', '--user-agent', USER_AGENT, '--proxy', SOCKS_PROXY, self.url ] time.sleep(1) # slight breathing space because MP try: p = subprocess.Popen(args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # todo: text=True (out, err) = p.communicate(timeout=CURL_TIMEOUT) hcode = extract_hcode(str(out)) # str() not needed if text=True if hcode == 200: err = PLACEHOLDER ecode = p.returncode except subprocess.TimeoutExpired as e: (out, err) = (PLACEHOLDER, str(e)) hcode = BADNESS + 10 ecode = BADNESS + 10 self.last_code = hcode self.attempt += 1 GLOBAL_DB.insert(dict( url=self.url, attempt=self.attempt, out=out, err=err, http_code=hcode, curl_exit=ecode, )) def fetchwrap(self): for i in range(RETRY_LIMIT): self.fetch1() print('try{0}: {1} {2}'.format(i, self.url, self.last_code)) if self.last_code < BADNESS: return time.sleep(RETRY_SLEEP) def caps(s): return ' '.join([w.capitalize() for w in s.lower().split()]) def deferred(s): return s in DEFERRED_CATEGORIES def get_categories(chunk): src = sorted(set([x['category'] for x in chunk])) dst = [ x for x in src if not deferred(x) ] dst.extend([ x for x in src if deferred(x) ]) return dst def get_placeholder(row, k): return placeholder(row.get(k, '')) def sort_using(chunk, k): return sorted(chunk, key=lambda x: x[k]) def grep_using(chunk, k, v, invert=False): if invert: return [ x for x in chunk if x.get(k, '') != v ] else: return [ x for x in chunk if x.get(k, '') == v ] def get_proof(row): url = get_placeholder(row, 'proof_url') if url == '-': return ':crystal_ball: to be confirmed' if url == 'tbc': return ':crystal_ball: to be confirmed' if url == 'ssl': return ':lock: see tls/ssl certificate' if url == 'header': return ':mag: see onion-location header' return '[link]({})'.format(url) def get_summary(url): rows = GLOBAL_DB.summary(url) if len(rows) == 0: return ( EMOJI_NO_DATA, ) result = [] for when, attempt, hcode, ecode, errstr in rows: errstr = unicode_cleanup(errstr) # THIS SHOULD NOT BE NEEDED, WHY? PERHAPS BECAUSE MULTI-LINE? emoji = EMOJI_UNSET if hcode >= 200 and hcode < 300: emoji = EMOJI_2xx elif hcode >= 300 and hcode < 400: emoji = EMOJI_3xx elif hcode >= 400 and hcode < 500: emoji = EMOJI_4xx elif hcode >= 500 and hcode < 600: emoji = EMOJI_5xx elif hcode >= BADNESS: emoji = EMOJI_DEAD # default if 'SSL certificate' in errstr: emoji = EMOJI_BAD_CERT elif 'timed out' in errstr: emoji = EMOJI_CONN_TIMEOUT elif "Can't complete SOCKS5 connection" in errstr: # todo: parse out socks error codes from https://datatracker.ietf.org/doc/html/rfc1928#section-6 if re.search(r'\(1\)$', errstr): emoji = EMOJI_NO_CONN elif re.search(r'\(4\)$', errstr): emoji = EMOJI_NO_DESC elif re.search(r'\(6\)$', errstr): emoji = EMOJI_TTL_TIMEOUT t = datetime.fromtimestamp(when, timezone.utc) result.append('{0}'.format(emoji, attempt, hcode, ecode, t)) return result def print_chunk(chunk, title, description=None, print_bar=True): print(LINE) print(H2, title) # was: caps(title) print() if description: print(description) print() for row in sort_using(chunk, 'site_name'): url = row['onion_url'] padlock = EMOJI_HTTPS if url.startswith('https') else EMOJI_HTTP print(H3, '[{site_name}]({onion_url})'.format(**row)) comment = get_placeholder(row, 'comment') if comment != '-': print('*{}*'.format(comment)) # short name oname = row['onion_name'] if oname != '': print(B, 'short: `{0}`'.format(oname)) # transport print(B, 'transport:', padlock) # linky-linky print(B, 'link: [{0}]({0})'.format(url)) # apparently some people like copying and pasting plain text print(B, 'plain: `{0}`'.format(url)) # print proof unconditionally, as encouragement to fix it print(B, 'proof: {0}'.format(get_proof(row))) if print_bar: bar = ''.join(get_summary(url)) print(B, 'check:', bar) print() print(INDEXJUMP) print() def poolhook(x): x.fetchwrap() def do_fetch(master): chunk = grep_using(master, 'flaky', TRUE_STRING, invert=True) work = [ URL(x['onion_url']) for x in chunk ] with Pool(POOL_WORKERS) as p: p.map(poolhook, work) def print_index(cats): print(LINE) print(H1, 'Index') print() for cat in cats: print(B, '[{0}](#{1})'.format(cat, # was: caps(cat) cat.lower().replace(' ', '-'))) print(B, '[{0}](#{1})'.format(FOOTNOTES, FOOTNOTES.lower())) print() def do_print(master): cats = get_categories(master) print_index(cats) for cat in cats: chunk = grep_using(master, 'category', cat) chunk = grep_using(chunk, 'flaky', TRUE_STRING, invert=True) print_chunk(chunk, cat) flaky = grep_using(master, 'flaky', TRUE_STRING) print_chunk(flaky, 'Flaky Sites', description='These sites have apparently stopped responding.', print_bar=False) def do_trash(): for x in GLOBAL_DB.trash(): print('trash:', x) if __name__ == '__main__': master = None with open(MASTER_CSV, 'r') as fh: dr = csv.DictReader(fh) master = [ x for x in dr ] with open(SECUREDROP_CSV, 'r') as fh: dr = csv.DictReader(fh) master.extend([ x for x in dr ]) GLOBAL_DB = Database(DB_FILENAME) for arg in sys.argv[1:]: if arg == 'fetch': do_fetch(master) if arg == 'print': do_print(master) if arg == 'trash': do_trash() GLOBAL_DB.close()