Merge branch 'master' into qa

* master:
  to avoid infinite loops in some cases, ignore the "claimed" field in the rethinkdb table "pages", because if a page is left "claimed", it must have been because of some error... site.claimed is the real claiming mechanism
  calm logging, don't print stacktrace on 430 from youtube-dl
  fix buglet in creation of new least_hops on pages table
  renaming scope rule "host" to "domain" to make it less confusing, since rules apply to subdomains as well
This commit is contained in:
Noah Levitt 2016-06-29 16:56:12 -05:00
commit 8576c71c62
4 changed files with 67 additions and 49 deletions

View File

@ -1,20 +1,20 @@
#
# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import logging
import brozzler
@ -68,8 +68,8 @@ class RethinkDbFrontier:
# console
self.r.table("pages").index_create(
"least_hops", [
r.row["site_id"], r.row["brozzle_count"],
r.row["hops_from_seed"]])
self.r.row["site_id"], self.r.row["brozzle_count"],
self.r.row["hops_from_seed"]])
if not "jobs" in tables:
self.logger.info(
"creating rethinkdb table 'jobs' in database %s",
@ -185,10 +185,19 @@ class RethinkDbFrontier:
return False
def claim_page(self, site, worker_id):
result = (self.r.table("pages")
.between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site")
.order_by(index=rethinkdb.desc("priority_by_site")).limit(1)
.update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run()
# ignores the "claimed" field of the page, because only one
# brozzler-worker can be working on a site at a time, and that would
# have to be the worker calling this method, so if something is claimed
# already, it must have been left that way because of some error
result = self.r.table("pages").between(
[site.id, 0, self.r.minval, self.r.minval],
[site.id, 0, self.r.maxval, self.r.maxval],
index="priority_by_site").order_by(
index=rethinkdb.desc("priority_by_site")).limit(
1).update({
"claimed":True,
"last_claimed_by":worker_id},
return_changes=True).run()
self._vet_result(result, replaced=[0,1])
if result["replaced"] == 1:
return brozzler.Page(**result["changes"][0]["new_val"])
@ -196,7 +205,10 @@ class RethinkDbFrontier:
raise brozzler.NothingToClaim
def has_outstanding_pages(self, site):
results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run()
results_iter = self.r.table("pages").between(
[site.id, 0, self.r.minval, self.r.minval],
[site.id, 0, self.r.maxval, self.r.maxval],
index="priority_by_site").limit(1).run()
return len(list(results_iter)) > 0
def page(self, id):

View File

@ -1,20 +1,20 @@
#
# brozzler/site.py - classes representing sites and pages
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler/site.py - classes representing sites and pages
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import surt
import json
@ -170,7 +170,7 @@ class Site(brozzler.BaseDictable):
Examples of valid rules:
[
{
"host": "monkey.org",
"domain": "monkey.org",
"url_match": "STRING_MATCH",
"value": "bar",
},
@ -179,7 +179,7 @@ class Site(brozzler.BaseDictable):
"value": "http://(com,woop,)/fuh/",
},
{
"host": "badhost.com",
"domain": "bad.domain.com",
},
]
"""
@ -188,7 +188,7 @@ class Site(brozzler.BaseDictable):
else:
u = url
if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
return False
if "url_match" in rule:
if rule["url_match"] == "STRING_MATCH":
@ -207,7 +207,7 @@ class Site(brozzler.BaseDictable):
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
return False
else:
if "host" in rule:
if "domain" in rule:
# we already know that it matches from earlier check
return True
else:

View File

@ -223,9 +223,15 @@ class BrozzlerWorker:
self._try_youtube_dl(ydl, site, page)
except brozzler.ReachedLimit as e:
raise
except:
self.logger.error("youtube_dl raised exception on %s",
page, exc_info=True)
except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and e.exc_info[1].code == 430):
self.logger.info(
'youtube-dl got %s %s processing %s',
e.exc_info[1].code, e.exc_info[1].msg, page.url)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True)
if self._needs_browsing(page, ydl_spy):
self.logger.info('needs browsing: %s', page)

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup(
name='brozzler',
version='1.1.dev27',
version='1.1.dev31',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',