mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master:
  - to avoid infinite loops in some cases, ignore the "claimed" field in the rethinkdb table "pages", because if a page is left "claimed", it must have been because of some error... site.claimed is the real claiming mechanism
  - calm logging, don't print stacktrace on 430 from youtube-dl
  - fix buglet in creation of new least_hops index on pages table
  - renaming scope rule "host" to "domain" to make it less confusing, since rules apply to subdomains as well
This commit is contained in:
commit
8576c71c62
@ -1,20 +1,20 @@
|
||||
#
|
||||
# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import logging
|
||||
import brozzler
|
||||
@ -68,8 +68,8 @@ class RethinkDbFrontier:
|
||||
# console
|
||||
self.r.table("pages").index_create(
|
||||
"least_hops", [
|
||||
r.row["site_id"], r.row["brozzle_count"],
|
||||
r.row["hops_from_seed"]])
|
||||
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||
self.r.row["hops_from_seed"]])
|
||||
if not "jobs" in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'jobs' in database %s",
|
||||
@ -185,10 +185,19 @@ class RethinkDbFrontier:
|
||||
return False
|
||||
|
||||
def claim_page(self, site, worker_id):
|
||||
result = (self.r.table("pages")
|
||||
.between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site")
|
||||
.order_by(index=rethinkdb.desc("priority_by_site")).limit(1)
|
||||
.update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run()
|
||||
# ignores the "claimed" field of the page, because only one
|
||||
# brozzler-worker can be working on a site at a time, and that would
|
||||
# have to be the worker calling this method, so if something is claimed
|
||||
# already, it must have been left that way because of some error
|
||||
result = self.r.table("pages").between(
|
||||
[site.id, 0, self.r.minval, self.r.minval],
|
||||
[site.id, 0, self.r.maxval, self.r.maxval],
|
||||
index="priority_by_site").order_by(
|
||||
index=rethinkdb.desc("priority_by_site")).limit(
|
||||
1).update({
|
||||
"claimed":True,
|
||||
"last_claimed_by":worker_id},
|
||||
return_changes=True).run()
|
||||
self._vet_result(result, replaced=[0,1])
|
||||
if result["replaced"] == 1:
|
||||
return brozzler.Page(**result["changes"][0]["new_val"])
|
||||
@ -196,7 +205,10 @@ class RethinkDbFrontier:
|
||||
raise brozzler.NothingToClaim
|
||||
|
||||
def has_outstanding_pages(self, site):
|
||||
results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run()
|
||||
results_iter = self.r.table("pages").between(
|
||||
[site.id, 0, self.r.minval, self.r.minval],
|
||||
[site.id, 0, self.r.maxval, self.r.maxval],
|
||||
index="priority_by_site").limit(1).run()
|
||||
return len(list(results_iter)) > 0
|
||||
|
||||
def page(self, id):
|
||||
|
@ -1,20 +1,20 @@
|
||||
#
|
||||
# brozzler/site.py - classes representing sites and pages
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler/site.py - classes representing sites and pages
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import surt
|
||||
import json
|
||||
@ -170,7 +170,7 @@ class Site(brozzler.BaseDictable):
|
||||
Examples of valid rules:
|
||||
[
|
||||
{
|
||||
"host": "monkey.org",
|
||||
"domain": "monkey.org",
|
||||
"url_match": "STRING_MATCH",
|
||||
"value": "bar",
|
||||
},
|
||||
@ -179,7 +179,7 @@ class Site(brozzler.BaseDictable):
|
||||
"value": "http://(com,woop,)/fuh/",
|
||||
},
|
||||
{
|
||||
"host": "badhost.com",
|
||||
"domain": "bad.domain.com",
|
||||
},
|
||||
]
|
||||
"""
|
||||
@ -188,7 +188,7 @@ class Site(brozzler.BaseDictable):
|
||||
else:
|
||||
u = url
|
||||
|
||||
if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
|
||||
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
|
||||
return False
|
||||
if "url_match" in rule:
|
||||
if rule["url_match"] == "STRING_MATCH":
|
||||
@ -207,7 +207,7 @@ class Site(brozzler.BaseDictable):
|
||||
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
||||
return False
|
||||
else:
|
||||
if "host" in rule:
|
||||
if "domain" in rule:
|
||||
# we already know that it matches from earlier check
|
||||
return True
|
||||
else:
|
||||
|
@ -223,9 +223,15 @@ class BrozzlerWorker:
|
||||
self._try_youtube_dl(ydl, site, page)
|
||||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except:
|
||||
self.logger.error("youtube_dl raised exception on %s",
|
||||
page, exc_info=True)
|
||||
except Exception as e:
|
||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||
and e.exc_info[1].code == 430):
|
||||
self.logger.info(
|
||||
'youtube-dl got %s %s processing %s',
|
||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||
else:
|
||||
self.logger.error(
|
||||
"youtube_dl raised exception on %s", page, exc_info=True)
|
||||
|
||||
if self._needs_browsing(page, ydl_spy):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
|
Loading…
x
Reference in New Issue
Block a user