diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 3a23413..7df5092 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -1,20 +1,20 @@ -# -# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +''' +brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' import logging import brozzler @@ -68,8 +68,8 @@ class RethinkDbFrontier: # console self.r.table("pages").index_create( "least_hops", [ - r.row["site_id"], r.row["brozzle_count"], - r.row["hops_from_seed"]]) + self.r.row["site_id"], self.r.row["brozzle_count"], + self.r.row["hops_from_seed"]]) if not "jobs" in tables: self.logger.info( "creating rethinkdb table 'jobs' in database %s", @@ -185,10 +185,19 @@ class RethinkDbFrontier: return False def claim_page(self, site, worker_id): - result = (self.r.table("pages") - .between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site") - .order_by(index=rethinkdb.desc("priority_by_site")).limit(1) - .update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run() + # ignores the "claimed" field of the page, because only one + # brozzler-worker can be working on a site at a time, and that would + # have to be the worker calling this method, so if something is claimed + # already, it must have been left that way because of some error + result = self.r.table("pages").between( + [site.id, 0, self.r.minval, self.r.minval], + [site.id, 0, self.r.maxval, self.r.maxval], + index="priority_by_site").order_by( + index=rethinkdb.desc("priority_by_site")).limit( + 1).update({ + "claimed":True, + "last_claimed_by":worker_id}, + return_changes=True).run() self._vet_result(result, replaced=[0,1]) if result["replaced"] == 1: return brozzler.Page(**result["changes"][0]["new_val"]) @@ -196,7 +205,10 @@ class RethinkDbFrontier: raise brozzler.NothingToClaim def has_outstanding_pages(self, site): - results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run() + results_iter = self.r.table("pages").between( + [site.id, 0, self.r.minval, self.r.minval], + [site.id, 0, self.r.maxval, self.r.maxval], + index="priority_by_site").limit(1).run() return len(list(results_iter)) > 0 def page(self, id): diff --git a/brozzler/site.py b/brozzler/site.py index 74927bd..53593de 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -1,20 +1,20 @@ -# -# brozzler/site.py - classes representing sites and pages -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +''' +brozzler/site.py - classes representing sites and pages + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' import surt import json @@ -170,7 +170,7 @@ class Site(brozzler.BaseDictable): Examples of valid rules: [ { - "host": "monkey.org", + "domain": "monkey.org", "url_match": "STRING_MATCH", "value": "bar", }, @@ -179,7 +179,7 @@ class Site(brozzler.BaseDictable): "value": "http://(com,woop,)/fuh/", }, { - "host": "badhost.com", + "domain": "bad.domain.com", }, ] """ @@ -188,7 +188,7 @@ class Site(brozzler.BaseDictable): else: u = url - if "host" in rule and not u.matches_ip_or_domain(rule["host"]): + if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): return False if "url_match" in rule: if rule["url_match"] == "STRING_MATCH": @@ -207,7 +207,7 @@ class Site(brozzler.BaseDictable): self.logger.warn("invalid rule.url_match=%s", rule.url_match) return False else: - if "host" in rule: + if "domain" in rule: # we already know that it matches from earlier check return True else: diff --git a/brozzler/worker.py b/brozzler/worker.py index 62f9148..dc1fc7b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -223,9 +223,15 @@ class BrozzlerWorker: self._try_youtube_dl(ydl, site, page) except brozzler.ReachedLimit as e: raise - except: - self.logger.error("youtube_dl raised exception on %s", - page, exc_info=True) + except Exception as e: + if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 + and e.exc_info[1].code == 430): + self.logger.info( + 'youtube-dl got %s %s processing %s', + e.exc_info[1].code, e.exc_info[1].msg, page.url) + else: + self.logger.error( + "youtube_dl raised exception on %s", page, exc_info=True) if self._needs_browsing(page, ydl_spy): self.logger.info('needs browsing: %s', page) diff --git a/setup.py b/setup.py index 371d04f..dc2ac7d 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev27', + version='1.1.dev31', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',