#!/usr/bin/env python3
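"""
Probe the onion URLs listed in master.csv with curl over a local Tor
SOCKS proxy, log every attempt to a sqlite3 database, and render the
results as a markdown document.

Usage:
    rwos-db.py fetch    # probe all non-flaky urls, logging to fetch.sqlite3
    rwos-db.py print    # write the markdown index + per-category sections
"""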
from datetime import datetime, timezone
from multiprocessing import Pool, Lock
import csv
import sqlite3
import subprocess
import sys
import time
GLOBAL_DB = None # has to be a global because pickling :-(
MASTER_CSV = 'master.csv'
DB_FILENAME = 'fetch.sqlite3'
SOCKS_PROXY = 'socks5h://127.0.0.1:9150/'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0"
BADNESS = 900
CURL_TIMEOUT = 120
RETRY_SLEEP = 60
RETRY_LIMIT = 6
PLACEHOLDER = '-'
POOL_WORKERS = 10
YES = 'y'
EMOJI_HTTP = ':wrench:'
EMOJI_HTTPS = ':closed_lock_with_key:'
EMOJI_UNSET = ':question:'
EMOJI_2xx = ':white_check_mark:'
EMOJI_3xx = ':arrow_right:'
EMOJI_4xx = ':negative_squared_cross_mark:'
EMOJI_5xx = ':red_circle:'
EMOJI_DEAD = ':sos:'
EMOJI_NO_DATA = ':interrobang:'
H1 = '#'
H2 = '##'
H3 = '###'
H4 = '####'
B = '*'
BB = '  *'
BBB = '    *'
LINE = '----'
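
# fetch log schema: one row per curl attempt; WAL journaling lets readers
# coexist with the pool workers' lock-serialised writes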
SCHEMA_SQL = '''
PRAGMA journal_mode = wal;
PRAGMA foreign_keys = ON;
PRAGMA encoding = "UTF-8";
BEGIN TRANSACTION;
CREATE TABLE IF NOT EXISTS fetches (
id INTEGER PRIMARY KEY NOT NULL,
ctime INTEGER DEFAULT (CAST(strftime('%s','now') AS INTEGER)) NOT NULL,
run TEXT NOT NULL,
url TEXT NOT NULL,
attempt INTEGER NOT NULL,
http_code INTEGER NOT NULL,
curl_exit INTEGER NOT NULL,
out TEXT NOT NULL,
err TEXT NOT NULL
);
PRAGMA user_version = 1;
COMMIT;
'''
INSERT_SQL = '''
INSERT INTO
fetches (run, url, attempt, out, err, http_code, curl_exit)
VALUES (:run, :url, :attempt, :out, :err, :http_code, :curl_exit)
'''
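
# for one url, select the final attempt of each run (the inner query's
# MAX(attempt) pivot), newest runs first, at most :limit rows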
SUMMARY_SQL = '''
SELECT foo.ctime, foo.attempt, foo.http_code
FROM fetches foo
INNER JOIN (
SELECT url, run, MAX(attempt) AS pivot
FROM fetches
WHERE url = :url
GROUP BY url, run
) bar
ON foo.url = bar.url
AND foo.run = bar.run
AND foo.attempt = bar.pivot
ORDER BY ctime DESC
LIMIT :limit
'''
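
# codes >= BADNESS are synthetic "HTTP codes" marking failure modes:
# +1/+2 no output at all, +3/+4 unparseable status line, +10 curl timeout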
def extract_hcode(s): # static
    if s is None:
        return BADNESS + 1
    lines = s.splitlines()
    if len(lines) == 0:
        return BADNESS + 2
    fields = lines[0].split()
    if len(fields) < 2:
        return BADNESS + 3
    try:
        code = int(fields[1])
    except ValueError: # was a bare except; only int() parse failures can occur here
        code = BADNESS + 4
    return code
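
# sqlite wrapper; the single instance (GLOBAL_DB) is created before the
# pool workers fork, so they inherit the connection plus the Lock that
# serialises their inserts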
class Database:
    def __init__(self, filename):
        self.connection = sqlite3.connect(filename)
        self.connection.text_factory = lambda x: x.decode('utf-8', 'ignore') # tolerate undecodable bytes
        self.cursor = self.connection.cursor()
        self.cursor.executescript(SCHEMA_SQL)
        self.now = time.strftime('%Y%m%d%H%M%S', time.gmtime()) # run id shared by this invocation
        self.lock = Lock()

    def commit(self):
        self.connection.commit()

    def close(self):
        self.commit()
        self.connection.close()

    def summary(self, url, limit=10):
        params = { 'url': url, 'limit': limit }
        rows = self.cursor.execute(SUMMARY_SQL, params)
        return rows.fetchall()

    def insert(self, rowhash):
        rowhash['run'] = self.now
        self.lock.acquire() # BEGIN PRIVILEGED CODE
        self.cursor.execute(INSERT_SQL, rowhash)
        self.commit()
        self.lock.release() # END PRIVILEGED CODE
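
# one probe target; fetch1() runs curl once via the SOCKS proxy and logs
# the outcome, fetchwrap() retries until a real (sub-BADNESS) code appears
# or RETRY_LIMIT is exhausted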
class URL:
    def __init__(self, url):
        if not (url.startswith('http://') or url.startswith('https://')):
            raise RuntimeError('not a proper url: ' + url)
        self.url = url
        self.attempt = 0
        self.last_code = None

    def fetch1(self):
        args = [ 'curl', '--head', '--user-agent', USER_AGENT, '--proxy', SOCKS_PROXY, self.url ]
        time.sleep(1) # slight breathing space because MP
        try:
            p = subprocess.Popen(args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            (out, err) = p.communicate(timeout=CURL_TIMEOUT)
            hcode = extract_hcode(out)
            if hcode == 200: err = PLACEHOLDER
            ecode = p.returncode
        except subprocess.TimeoutExpired as e:
            p.kill() # reap the stalled curl rather than leave it running
            (out, err) = (PLACEHOLDER, str(e))
            hcode = BADNESS + 10
            ecode = BADNESS + 10
        self.last_code = hcode
        self.attempt += 1
        GLOBAL_DB.insert(dict(
            url=self.url,
            attempt=self.attempt,
            out=out,
            err=err,
            http_code=hcode,
            curl_exit=ecode,
        ))

    def fetchwrap(self):
        for i in range(RETRY_LIMIT):
            self.fetch1()
            print('try{0}: {1} {2}'.format(i, self.url, self.last_code))
            if self.last_code < BADNESS: return
            time.sleep(RETRY_SLEEP)
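
# helpers for rendering master.csv rows as markdown; PLACEHOLDER ('-')
# stands in for any empty field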
def placeholder(s):
    if s == '': return PLACEHOLDER
    if s is None: return PLACEHOLDER
    return s

def caps(s):
    return ' '.join([w.capitalize() for w in s.lower().split()])

def deferred(s):
    return s.startswith('globaleaks') or s.startswith('securedrop')
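
# ordinary categories sort alphabetically; the globaleaks/securedrop
# families are deferred to the end of the document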
def get_categories(chunk):
    src = sorted(set([x['category'] for x in chunk]))
    dst = [ x for x in src if not deferred(x) ]
    dst.extend([ x for x in src if deferred(x) ])
    return dst

def get_placeholder(row, k):
    return placeholder(row.get(k, ''))

def sort_using(chunk, k):
    return sorted(chunk, key=lambda x: x[k])

def grep_using(chunk, k, v, invert=False):
    if invert:
        return [ x for x in chunk if x.get(k, '') != v ]
    else:
        return [ x for x in chunk if x.get(k, '') == v ]
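
# proof_url has two magic values: '-' means no proof yet, 'ssl' means the
# tls certificate itself is the proof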
def get_proof(row):
    url = get_placeholder(row, 'proof_url')
    if url == '-': return 'to be done'
    if url == 'ssl': return 'see tls/ssl certificate'
    return '[link]({})'.format(url)

def get_summary(url):
    rows = GLOBAL_DB.summary(url)
    if len(rows) == 0:
        return ( EMOJI_NO_DATA, )
    result = []
    for when, attempt, code in rows:
        emoji = EMOJI_UNSET
        if 200 <= code < 300:
            emoji = EMOJI_2xx
        elif 300 <= code < 400:
            emoji = EMOJI_3xx
        elif 400 <= code < 500:
            emoji = EMOJI_4xx
        elif 500 <= code < 600:
            emoji = EMOJI_5xx
        elif code >= BADNESS:
            emoji = EMOJI_DEAD
        t = datetime.fromtimestamp(when, timezone.utc)
        result.append('{0} attempt={1} code={2} time={3}'.format(emoji, attempt, code, t))
    return result
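
# emit one markdown section: H2 title, optional description, then an H3
# entry per site with padlock emoji, proof link and recent fetch history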
def print_chunk(chunk, title, description=None, print_bar=True):
    print(LINE)
    print(H2, caps(title))
    print()
    if description:
        print(description)
        print()
    for row in sort_using(chunk, 'site_name'):
        url = row['onion_url']
        padlock = EMOJI_HTTPS if url.startswith('https') else EMOJI_HTTP
        print(H3, '[{site_name}]({onion_url})'.format(**row))
        comment = get_placeholder(row, 'comment')
        if comment != '-': print('*{}*'.format(comment))
        print(B, '[{0}]({0})'.format(url), padlock)
        # print proof unconditionally, as encouragement to fix it
        print(B, 'proof: {0}'.format(get_proof(row)))
        if print_bar:
            for fetch in get_summary(url):
                print(BB, fetch)
        print()

def poolhook(x):
    x.fetchwrap()
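
# probe every non-flaky url in parallel; each worker drives URL.fetchwrap
# via poolhook and records results through the inherited GLOBAL_DB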
def do_fetch(master):
    chunk = grep_using(master, 'flaky', YES, invert=True)
    work = [ URL(x['onion_url']) for x in chunk ]
    with Pool(POOL_WORKERS) as p: p.map(poolhook, work)

def print_index(cats):
    print(LINE)
    print(H1, 'Index')
    print()
    for cat in cats:
        print(B, '[{0}](#{1})'.format(caps(cat), cat.lower().replace(' ', '-')))
    print()

def do_print(master):
    cats = get_categories(master)
    print_index(cats)
    for cat in cats:
        chunk = grep_using(master, 'category', cat)
        chunk = grep_using(chunk, 'flaky', YES, invert=True)
        print_chunk(chunk, cat)
    flaky = grep_using(master, 'flaky', YES)
    print_chunk(flaky, 'Flaky Sites', description='These sites have apparently stopped responding.', print_bar=False)

if __name__ == '__main__':
    master = None
    # csv: category, site_name, flaky, onion_url, comment, proof_url
    with open(MASTER_CSV, 'r') as fh:
        dr = csv.DictReader(fh)
        master = [ x for x in dr ]
    GLOBAL_DB = Database(DB_FILENAME)
    for arg in sys.argv[1:]:
        if arg == 'fetch': do_fetch(master)
        if arg == 'print': do_print(master)
    GLOBAL_DB.close()