missed a spot where is_permitted_by_robots needs monkeying

This commit is contained in:
Noah Levitt 2018-05-15 16:52:48 -07:00
parent de1f240e25
commit 1572fd3ed6

View file

@@ -1000,10 +1000,15 @@ def test_max_hops_off():
     assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True
     assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False
-    # two of these are in scope because of max_hops_off
-    frontier.scope_and_schedule_outlinks(site, seed_page, [
-        'http://foo.org/', 'https://example.com/toot',
-        'http://example.com/toot', 'https://some.bad.domain/something'])
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        # two of these are in scope because of max_hops_off
+        frontier.scope_and_schedule_outlinks(site, seed_page, [
+            'http://foo.org/', 'https://example.com/toot',
+            'http://example.com/toot', 'https://some.bad.domain/something'])
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
     pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
@@ -1062,8 +1067,13 @@ def test_max_hops_off():
     # next hop is past max_hops_off, but normal in scope url is in scope
     foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
-    frontier.scope_and_schedule_outlinks(site, foo_page, [
-        'http://foo.org/bar', 'http://example.com/blah'])
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, foo_page, [
+            'http://foo.org/bar', 'http://example.com/blah'])
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
     assert foo_page == {
         'brozzle_count': 0,
         'claimed': False,