brozzler/umbra/url.py
2015-07-11 02:29:19 -07:00

44 lines
1.3 KiB
Python

# vim: set sw=4 et:
import surt
import json
class CrawlUrl:
    """A URL to be crawled, plus bookkeeping about where it sits in a crawl.

    Public attributes (all set directly from the constructor arguments):

    * ``url`` -- the URL string exactly as supplied.
    * ``id`` -- caller-supplied identifier, or None.
    * ``site_id`` -- caller-supplied identifier of the owning site, or None.
    * ``hops_from_seed`` -- caller-supplied hop count (default 0).
    * ``outlinks`` -- caller-supplied iterable of outlinks, or None.
    """

    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
        """Store the given values; no parsing or validation happens here."""
        self.id = id
        self.site_id = site_id
        self.url = url
        self.hops_from_seed = hops_from_seed
        # Cache for canonical(): the canonicalized surt.handyurl, built lazily.
        self._canon_hurl = None
        self.outlinks = outlinks

    def __repr__(self):
        """Return a short description showing url, site_id and hops_from_seed."""
        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
                self.url, self.site_id, self.hops_from_seed)

    def calc_priority(self):
        """Return an integer score between 0 and 16.

        The score is the sum of two clamped terms: ``10 - hops_from_seed``
        and ``6 - (number of "/" characters in the canonical URL)``, each
        floored at 0. Fewer hops and fewer slashes give a larger score.
        """
        priority = 0
        priority += max(0, 10 - self.hops_from_seed)
        priority += max(0, 6 - self.canonical().count("/"))
        return priority

    def canonical(self):
        """Return the canonicalized form of ``self.url`` as a string.

        The URL is parsed with ``surt.handyurl.parse`` and canonicalized in
        place by ``surt.GoogleURLCanonicalizer.canonicalize``; the resulting
        object is cached so that work is done at most once per instance.

        Any exception raised by surt propagates to the caller.
        """
        if self._canon_hurl is None:
            # Work on a local and only publish it to the cache once
            # canonicalization has succeeded. Otherwise a failure here would
            # leave a parsed-but-uncanonicalized URL cached, and every later
            # call would silently return the non-canonical form.
            hurl = surt.handyurl.parse(self.url)
            surt.GoogleURLCanonicalizer.canonicalize(hurl)
            self._canon_hurl = hurl
        return self._canon_hurl.geturl()

    def to_dict(self):
        """Return a dict with keys id, site_id, url, hops_from_seed, outlinks.

        ``outlinks`` is passed through unchanged when it is None or already
        a list (the same list object, not a copy); any other iterable (set,
        tuple, ...) is copied into a new list.
        """
        if self.outlinks is not None and not isinstance(self.outlinks, list):
            outlinks = list(self.outlinks)
        else:
            outlinks = self.outlinks

        return dict(id=self.id, site_id=self.site_id, url=self.url,
                hops_from_seed=self.hops_from_seed, outlinks=outlinks)

    def to_json(self):
        """Return ``to_dict()`` serialized as compact JSON (no extra spaces)."""
        return json.dumps(self.to_dict(), separators=(',', ':'))