real-world-onion-sites/rwos-db.py

357 lines
11 KiB
Python
Raw Normal View History

2019-11-16 09:24:14 -05:00
#!/usr/bin/env python3
from datetime import datetime, timezone
from multiprocessing import Pool, Lock
import csv
import datetime as dt
import re
2019-11-16 09:24:14 -05:00
import sqlite3
import subprocess
import sys
import time
# --- shared state ---------------------------------------------------------
# Assigned a Database instance in the __main__ block; URL.fetch1(),
# get_summary() and do_trash() all reach the database through this name.
GLOBAL_DB = None  # has to be a global because pickling :-(

# --- files ----------------------------------------------------------------
MASTER_CSV = 'master.csv'                # first CSV loaded at startup
SECUREDROP_CSV = 'securedrop-api.csv'    # second CSV, appended to the first
DB_FILENAME = 'fetch.sqlite3'            # SQLite file holding fetch results

# --- curl invocation ------------------------------------------------------
SOCKS_PROXY = 'socks5h://127.0.0.1:9050/'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
CURL_TIMEOUT = 120   # seconds; handed to Popen.communicate(timeout=...)

# --- retry policy ---------------------------------------------------------
BADNESS = 900        # codes >= BADNESS are synthetic "fetch failed" markers
RETRY_SLEEP = 60     # seconds; handed to time.sleep() between attempts
RETRY_LIMIT = 6      # maximum attempts per URL in URL.fetchwrap()
POOL_WORKERS = 10    # size of the multiprocessing Pool used by do_fetch()

# --- reporting ------------------------------------------------------------
PLACEHOLDER = '-'        # stand-in for empty / missing values
DETECTOR_HISTORY = 14    # default row limit for Database.summary()
TRUE_STRING = 'TRUE'     # value of the 'flaky' CSV column that marks a site flaky
FOOTNOTES = 'Footnotes'  # extra entry appended to the printed index
2019-11-16 09:24:14 -05:00
2019-11-21 15:34:17 -05:00
# Categories that get_categories() moves to the end of the category list:
# stuff to push down the page due to size.
DEFERRED_CATEGORIES = ('Globaleaks', 'SecureDrop')
2022-03-06 23:42:49 -05:00
# Transport labels printed by print_chunk().
EMOJI_HTTP = ':small_red_triangle: **HTTP**'
EMOJI_HTTPS = ':closed_lock_with_key: **HTTPS**'

# Status markers chosen by get_summary() from the HTTP status class.
EMOJI_UNSET = ':question:'             # code matched none of the ranges below
EMOJI_2xx = ':white_check_mark:'
EMOJI_3xx = ':eight_spoked_asterisk:'
EMOJI_4xx = ':no_entry_sign:'
EMOJI_5xx = ':stop_sign:'

# Status markers chosen by get_summary() when the code is >= BADNESS,
# picked by looking at the stored error text.
EMOJI_DEAD = ':sos:'                   # default when nothing more specific matches
EMOJI_BAD_CERT = ':key:'
EMOJI_CONN_TIMEOUT = ':alarm_clock:'
EMOJI_NO_CONN = ':exclamation:'
EMOJI_NO_DESC = ':question:'           # same glyph as EMOJI_UNSET
EMOJI_TTL_TIMEOUT = ':timer_clock:'

# Shown by get_summary() when the database has no rows for a URL.
EMOJI_NO_DATA = ':new:'
2019-11-16 09:24:14 -05:00
# Markdown heading prefixes.
H1, H2, H3, H4 = '#', '##', '###', '####'

# Markdown bullet prefixes.
# NOTE(review): BB and BBB are identical as they appear here; presumably they
# were meant to carry different indentation -- confirm against the real file.
B = '*'
BB = ' *'
BBB = ' *'

# Horizontal rule printed ahead of each section.
LINE = '----'

# Link back to the index, printed by print_chunk().
INDEXJUMP = ':arrow_up: [return to top index](#index)'
2019-11-16 09:24:14 -05:00
# Script run by Database.__init__() via executescript().  It selects WAL
# journalling, enables foreign keys, sets UTF-8 encoding, and creates the
# `fetches` table if it does not exist yet.  `ctime` defaults to the current
# Unix time in seconds; every other column must be supplied by the INSERT.
# The schema version is recorded as user_version 1.
SCHEMA_SQL = '''
PRAGMA journal_mode = wal;
PRAGMA foreign_keys = ON;
PRAGMA encoding = "UTF-8";
BEGIN TRANSACTION;
CREATE TABLE IF NOT EXISTS fetches (
id INTEGER PRIMARY KEY NOT NULL,
ctime INTEGER DEFAULT (CAST(strftime('%s','now') AS INTEGER)) NOT NULL,
run TEXT NOT NULL,
url TEXT NOT NULL,
attempt INTEGER NOT NULL,
http_code INTEGER NOT NULL,
curl_exit INTEGER NOT NULL,
out TEXT NOT NULL,
err TEXT NOT NULL
);
PRAGMA user_version = 1;
COMMIT;
'''
# Statement used by Database.insert() to record one fetch attempt.  It takes
# named parameters, so the dict passed to execute() must provide the keys
# run, url, attempt, out, err, http_code and curl_exit.
INSERT_SQL = '''
INSERT INTO
fetches (run, url, attempt, out, err, http_code, curl_exit)
VALUES (:run, :url, :attempt, :out, :err, :http_code, :curl_exit)
'''
# Query used by Database.summary().  For the URL bound to :url it returns one
# row per run -- the row carrying that run's highest attempt number -- as
# (ctime, attempt, http_code, curl_exit, err), newest first, capped at :limit
# rows.  The inner SELECT finds the highest attempt per (url, run); the join
# then picks the matching full rows.
SUMMARY_SQL = '''
SELECT foo.ctime, foo.attempt, foo.http_code, foo.curl_exit, foo.err
FROM fetches foo
INNER JOIN (
SELECT url, run, MAX(attempt) AS pivot
FROM fetches
WHERE url = :url
GROUP BY url, run
) bar
ON foo.url = bar.url
AND foo.run = bar.run
AND foo.attempt = bar.pivot
ORDER BY ctime DESC
LIMIT :limit
'''
# Statement used by Database.trash(): deletes every row of `fetches` whose
# ctime (Unix seconds) is earlier than 30 days before the current time.
TRASH_SQL = '''
DELETE
FROM fetches
WHERE ctime < (CAST(strftime('%s', (SELECT DATETIME('now', '-30 day'))) AS INTEGER));
'''
2019-11-16 09:24:14 -05:00
def extract_hcode(s): # static
    """Pull the numeric status code out of HTTP response-header text.

    The second whitespace-separated field on the first line of ``s`` is
    taken to be the status code (as in ``HTTP/1.1 200 OK``).  Nothing is
    raised when that fails; a synthetic code above ``BADNESS`` comes back:

    * ``BADNESS + 1`` -- ``s`` is ``None``
    * ``BADNESS + 2`` -- ``s`` has no lines at all
    * ``BADNESS + 3`` -- the first line has fewer than two fields
    * ``BADNESS + 4`` -- the second field is not an integer
    """
    if s is None:
        return BADNESS + 1
    lines = s.splitlines()
    if not lines:
        return BADNESS + 2
    fields = lines[0].split()
    if len(fields) < 2:
        return BADNESS + 3
    try:
        return int(fields[1])
    except ValueError:
        # Only a malformed number is expected here; a bare `except` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        return BADNESS + 4
def placeholder(s):
    """Return ``PLACEHOLDER`` for ``None`` or the empty string, else ``s``.

    Any other value, including empty ``bytes``, is returned untouched.
    """
    # `is None` is an identity test and cannot be fooled by a custom __eq__
    if s is None or s == '':
        return PLACEHOLDER
    return s
def unicode_cleanup(x):
    """Normalise ``x`` to a ``str``.

    Blank and ``None`` inputs are first swapped for the placeholder.  A
    value that is already a ``str`` is returned as-is; anything else is
    treated as a byte array and decoded as UTF-8, dropping undecodable bytes.
    """
    cleaned = placeholder(x) # canonicalise blanks and None
    if not isinstance(cleaned, str): # is byte array
        return cleaned.decode('utf-8', 'ignore')
    return cleaned # native python3 utf-8 string
2019-11-16 09:24:14 -05:00
class Database:
    """Wrapper around the SQLite file that stores fetch results.

    Attributes:
        connection: the ``sqlite3`` connection.
        cursor: a single cursor reused for every statement.
        now: UTC timestamp string (``YYYYMMDDHHMMSS``) taken at construction
            and written to the ``run`` column of every inserted row.
        lock: ``multiprocessing.Lock`` held around each write and commit.
    """

    def __init__(self, filename):
        """Open ``filename`` and run ``SCHEMA_SQL`` against it."""
        self.connection = sqlite3.connect(filename)
        # text values read back from the database go through unicode_cleanup
        self.connection.text_factory = lambda x: unicode_cleanup(x)
        self.cursor = self.connection.cursor()
        self.cursor.executescript(SCHEMA_SQL)
        self.now = time.strftime('%Y%m%d%H%M%S', time.gmtime())
        self.lock = Lock()

    def commit(self):
        """Commit the current transaction."""
        self.connection.commit()

    def close(self):
        """Commit any pending work, then close the connection."""
        self.commit()
        self.connection.close()

    def summary(self, url, limit=DETECTOR_HISTORY):
        """Return the rows produced by ``SUMMARY_SQL`` for ``url``.

        Each row is ``(ctime, attempt, http_code, curl_exit, err)``; at most
        ``limit`` rows are returned.
        """
        params = { 'url': url, 'limit': limit }
        rows = self.cursor.execute(SUMMARY_SQL, params)
        return rows.fetchall()

    def insert(self, rowhash):
        """Insert one fetch record and commit it.

        ``rowhash`` supplies the named parameters of ``INSERT_SQL`` other
        than ``run``; its ``run`` key is set in place to this run's timestamp.
        """
        rowhash['run'] = self.now
        # `with` releases the lock even if execute() or commit() raises;
        # the manual acquire()/release() pair left it held in that case.
        with self.lock: # PRIVILEGED CODE
            self.cursor.execute(INSERT_SQL, rowhash)
            self.commit()

    def trash(self):
        """Run ``TRASH_SQL``, commit, and return whatever rows it yields."""
        with self.lock: # PRIVILEGED CODE
            result = self.cursor.execute(TRASH_SQL)
            self.commit()
        return result.fetchall()
2019-11-16 09:24:14 -05:00
class URL:
    """One URL to probe, plus the bookkeeping for its fetch attempts.

    Attributes:
        url: the address, which must start with ``http://`` or ``https://``.
        attempt: number of fetches performed so far.
        last_code: status code of the most recent fetch, ``None`` before any.
    """

    def __init__(self, url):
        """Store ``url``; raise ``RuntimeError`` if it is not http(s)."""
        if not url.startswith(('http://', 'https://')):
            raise RuntimeError('not a proper url: ' + url)
        self.url = url
        self.attempt = 0
        self.last_code = None

    def fetch1(self):
        """Run one ``curl --head`` against the URL and record the result.

        curl is given ``USER_AGENT`` and ``SOCKS_PROXY`` and up to
        ``CURL_TIMEOUT`` seconds.  The status code parsed from its output
        (or ``BADNESS + 10`` on timeout) is kept in ``self.last_code`` and a
        row is written through ``GLOBAL_DB.insert``.
        """
        args = [ 'curl', '--head', '--user-agent', USER_AGENT, '--proxy', SOCKS_PROXY, self.url ]
        time.sleep(1) # slight breathing space because MP
        p = subprocess.Popen(args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # todo: text=True
        try:
            (out, err) = p.communicate(timeout=CURL_TIMEOUT)
        except subprocess.TimeoutExpired as e:
            # communicate() does not kill the child when the timeout expires,
            # so stop curl and reap it rather than leaving it running.
            p.kill()
            p.communicate()
            (out, err) = (PLACEHOLDER, str(e))
            hcode = BADNESS + 10
            ecode = BADNESS + 10
        else:
            hcode = extract_hcode(str(out)) # str() not needed if text=True
            if hcode == 200: err = PLACEHOLDER
            ecode = p.returncode
        self.last_code = hcode
        self.attempt += 1
        GLOBAL_DB.insert(dict(
            url=self.url,
            attempt=self.attempt,
            out=out,
            err=err,
            http_code=hcode,
            curl_exit=ecode,
        ))

    def fetchwrap(self):
        """Fetch until a code below ``BADNESS`` arrives or tries run out.

        At most ``RETRY_LIMIT`` attempts are made, ``RETRY_SLEEP`` seconds
        apart, and each attempt is reported on stdout.
        """
        for i in range(RETRY_LIMIT):
            self.fetch1()
            print('try{0}: {1} {2}'.format(i, self.url, self.last_code))
            if self.last_code < BADNESS: return
            # nothing follows the final attempt, so do not sleep after it
            if i + 1 < RETRY_LIMIT:
                time.sleep(RETRY_SLEEP)
def caps(s):
    """Lower-case ``s``, capitalise each word, and join with single spaces."""
    words = s.lower().split()
    return ' '.join(map(str.capitalize, words))
2019-11-16 16:13:33 -05:00
def deferred(s):
    """Tell whether ``s`` is listed in ``DEFERRED_CATEGORIES``."""
    is_deferred = s in DEFERRED_CATEGORIES
    return is_deferred
2019-11-16 16:13:33 -05:00
2019-11-16 09:24:14 -05:00
def get_categories(chunk):
    """List the distinct ``category`` values found in ``chunk``.

    Categories are sorted alphabetically, with the non-deferred ones first
    and the deferred ones after them.
    """
    names = {row['category'] for row in chunk}
    # False sorts before True, so deferred categories land at the end
    return sorted(names, key=lambda name: (deferred(name), name))
2019-11-16 09:24:14 -05:00
def get_placeholder(row, k):
    """Look up ``row[k]``, substituting the placeholder when missing or blank."""
    value = row.get(k, '')
    return placeholder(value)
def sort_using(chunk, k):
    """Return a new list of the rows in ``chunk`` ordered by ``row[k]``."""
    ordered = list(chunk)
    ordered.sort(key=lambda row: row[k])
    return ordered
def grep_using(chunk, k, v, invert=False):
    """Filter ``chunk`` down to the rows whose ``k`` value equals ``v``.

    A row without key ``k`` is treated as holding ``''``.  With ``invert``
    set, the rows that do not match are returned instead.
    """
    def keep(row):
        same = row.get(k, '') == v
        return not same if invert else same

    return list(filter(keep, chunk))
def get_proof(row):
    """Render the ``proof_url`` field of ``row`` as markdown text.

    The special values ``-``, ``tbc``, ``ssl`` and ``header`` map to fixed
    phrases; any other value is wrapped in a markdown link.
    """
    canned = {
        '-': ':crystal_ball: to be confirmed',
        'tbc': ':crystal_ball: to be confirmed',
        'ssl': ':lock: see tls/ssl certificate',
        'header': ':mag: see onion-location header',
    }
    url = get_placeholder(row, 'proof_url')
    if url in canned:
        return canned[url]
    return '[link]({})'.format(url)
2019-11-16 09:24:14 -05:00
def _status_emoji(hcode, errstr):
    """Pick the emoji for one fetch from its status code and error text."""
    if 200 <= hcode < 300:
        return EMOJI_2xx
    if 300 <= hcode < 400:
        return EMOJI_3xx
    if 400 <= hcode < 500:
        return EMOJI_4xx
    if 500 <= hcode < 600:
        return EMOJI_5xx
    if hcode < BADNESS:
        return EMOJI_UNSET
    # synthetic failure code: refine using the error text
    if 'SSL certificate' in errstr:
        return EMOJI_BAD_CERT
    if 'timed out' in errstr:
        return EMOJI_CONN_TIMEOUT
    if "Can't complete SOCKS5 connection" in errstr:
        # todo: parse out socks error codes from https://datatracker.ietf.org/doc/html/rfc1928#section-6
        socks_codes = (
            (r'\(1\)$', EMOJI_NO_CONN),
            (r'\(4\)$', EMOJI_NO_DESC),
            (r'\(6\)$', EMOJI_TTL_TIMEOUT),
        )
        for pattern, emoji in socks_codes:
            if re.search(pattern, errstr):
                return emoji
    return EMOJI_DEAD # default


def get_summary(url):
    """Build the row of status markers shown for ``url``.

    One ``<span>`` is produced per row returned by ``GLOBAL_DB.summary``;
    its tooltip carries the attempt count, status code, curl exit code and
    UTC time, and its body is the emoji for that fetch.  When there are no
    rows a one-element tuple holding ``EMOJI_NO_DATA`` is returned.
    """
    rows = GLOBAL_DB.summary(url)
    if not rows:
        return ( EMOJI_NO_DATA, )
    template = '<span title="attempts={1} code={2} exit={3} time={4}">{0}</span>'
    result = []
    for when, attempt, hcode, ecode, errstr in rows:
        # THIS SHOULD NOT BE NEEDED, WHY? PERHAPS BECAUSE MULTI-LINE?
        emoji = _status_emoji(hcode, unicode_cleanup(errstr))
        stamp = datetime.fromtimestamp(when, timezone.utc)
        result.append(template.format(emoji, attempt, hcode, ecode, stamp))
    return result
def print_chunk(chunk, title, description=None, print_bar=True):
    """Print one markdown section listing the sites in ``chunk``.

    Args:
        chunk: rows to print; they are sorted by ``site_name``.
        title: section heading text.
        description: optional paragraph printed under the heading.
        print_bar: when true, add a ``check:`` line built from get_summary().
    """
    print(LINE)
    print(H2, title) # was: caps(title)
    print()
    if description:
        print(description)
        print()
    for site in sort_using(chunk, 'site_name'):
        address = site['onion_url']
        if address.startswith('https'):
            transport = EMOJI_HTTPS
        else:
            transport = EMOJI_HTTP
        print(H3, '[{site_name}]({onion_url})'.format(**site))
        note = get_placeholder(site, 'comment')
        if note != '-':
            print('*{}*'.format(note))
        # short name
        short_name = site['onion_name']
        if short_name != '':
            print(B, 'short: `{0}`'.format(short_name))
        # transport
        print(B, 'transport:', transport)
        # linky-linky
        print(B, 'link: [{0}]({0})'.format(address))
        # apparently some people like copying and pasting plain text
        print(B, 'plain: `{0}`'.format(address))
        # print proof unconditionally, as encouragement to fix it
        print(B, 'proof: {0}'.format(get_proof(site)))
        if print_bar:
            print(B, 'check:', ''.join(get_summary(address)))
        print()
    # NOTE(review): the original indentation was lost; this assumes the index
    # link is printed once per section rather than once per site -- confirm.
    print(INDEXJUMP)
    print()
2019-11-16 09:24:14 -05:00
def poolhook(x):
    """Call ``x.fetchwrap()``; this is the function do_fetch() maps over its pool."""
    run = x.fetchwrap
    run()
def do_fetch(master):
    """Fetch every row of ``master`` not marked flaky, using a worker pool."""
    steady = grep_using(master, 'flaky', TRUE_STRING, invert=True)
    work = []
    for row in steady:
        work.append(URL(row['onion_url']))
    with Pool(POOL_WORKERS) as workers:
        workers.map(poolhook, work)
def print_index(cats):
    """Print the markdown index: one bullet per category, then the footnotes."""
    entry = '[{0}](#{1})'
    print(LINE)
    print(H1, 'Index')
    print()
    for cat in cats:
        # anchor is the lower-cased name with spaces turned into hyphens
        anchor = cat.lower().replace(' ', '-')
        print(B, entry.format(cat, anchor)) # was: caps(cat)
    print(B, entry.format(FOOTNOTES, FOOTNOTES.lower()))
    print()
def do_print(master):
    """Print the index, one section per category, then the flaky sites.

    Rows marked flaky are left out of the category sections and gathered in
    a final 'Flaky Sites' section that carries no check bar.
    """
    cats = get_categories(master)
    print_index(cats)
    for cat in cats:
        in_category = grep_using(master, 'category', cat)
        live = grep_using(in_category, 'flaky', TRUE_STRING, invert=True)
        print_chunk(live, cat)
    print_chunk(
        grep_using(master, 'flaky', TRUE_STRING),
        'Flaky Sites',
        description='These sites have apparently stopped responding.',
        print_bar=False,
    )
2019-11-16 09:24:14 -05:00
def do_trash():
    """Run the database clean-up and print each row it returns."""
    removed = GLOBAL_DB.trash()
    for item in removed:
        print('trash:', item)
2019-11-16 09:24:14 -05:00
if __name__ == '__main__':
    # Load both CSV files into one list of row dicts.  The csv module asks
    # for files to be opened with newline='' so that line endings inside
    # quoted fields are handled by the reader itself.
    master = []
    for filename in (MASTER_CSV, SECUREDROP_CSV):
        with open(filename, 'r', newline='') as fh:
            master.extend(csv.DictReader(fh))

    GLOBAL_DB = Database(DB_FILENAME)
    try:
        # Each command-line word is a command, run in the order given;
        # unrecognised words are ignored.
        for arg in sys.argv[1:]:
            if arg == 'fetch': do_fetch(master)
            if arg == 'print': do_print(master)
            if arg == 'trash': do_trash()
    finally:
        # commit and close even when a command raises
        GLOBAL_DB.close()