mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
commit
c685a4432c
3 changed files with 36 additions and 3 deletions
|
@ -37,6 +37,7 @@ import base64
|
||||||
import psutil
|
import psutil
|
||||||
import signal
|
import signal
|
||||||
import string
|
import string
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
__all__ = ["BrowserPool", "Browser"]
|
__all__ = ["BrowserPool", "Browser"]
|
||||||
|
|
||||||
|
@ -128,11 +129,23 @@ class Browser:
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.stop()
|
self.stop()
|
||||||
|
|
||||||
def start(self, proxy=None):
|
def start(self, proxy=None, cookie_db=None):
|
||||||
if not self._chrome_instance:
|
if not self._chrome_instance:
|
||||||
# these can raise exceptions
|
# these can raise exceptions
|
||||||
self.chrome_port = self._find_available_port()
|
self.chrome_port = self._find_available_port()
|
||||||
self._work_dir = tempfile.TemporaryDirectory()
|
self._work_dir = tempfile.TemporaryDirectory()
|
||||||
|
if cookie_db is not None:
|
||||||
|
cookie_dir = os.sep.join([self._work_dir.name, "chrome-user-data","Default"])
|
||||||
|
cookie_location = os.sep.join([cookie_dir,"Cookies"])
|
||||||
|
self.logger.debug("Cookie DB provided. Writing to: %s", cookie_location)
|
||||||
|
os.makedirs(cookie_dir, exist_ok=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(cookie_location,'wb') as cookie_file:
|
||||||
|
cookie_file.write(cookie_db)
|
||||||
|
except OSError:
|
||||||
|
self.logger.error("exception writing cookie file at: %s", cookie_location, exc_info=True)
|
||||||
|
|
||||||
self._chrome_instance = Chrome(
|
self._chrome_instance = Chrome(
|
||||||
port=self.chrome_port, executable=self.chrome_exe,
|
port=self.chrome_port, executable=self.chrome_exe,
|
||||||
user_home_dir=self._work_dir.name,
|
user_home_dir=self._work_dir.name,
|
||||||
|
@ -161,6 +174,24 @@ class Browser:
|
||||||
except:
|
except:
|
||||||
self.logger.error("problem stopping", exc_info=True)
|
self.logger.error("problem stopping", exc_info=True)
|
||||||
|
|
||||||
|
def persist_and_read_cookie_db(self):
    """Mark all stored cookies persistent and return the raw Cookies DB.

    Opens the Chrome profile's SQLite cookie store under the browser's
    working directory (``chrome-user-data/Default/Cookies``), sets
    ``persistent = 1`` on every row of the ``cookies`` table, and then
    reads the whole file back as bytes.
    Presumably the flag is flipped so that session cookies are kept when
    the DB is later written into a fresh profile -- confirm against Chrome.

    Returns:
        The contents of the Cookies file as ``bytes``, or ``None`` if no
        working directory is available or the file could not be read.
        SQLite and file errors are logged, not raised.
    """
    # NOTE(review): callers appear to invoke this even when the browser
    # was never started for the page; assumes `_work_dir` is None (or
    # unset) in that case -- confirm against Browser.__init__.
    work_dir = getattr(self, "_work_dir", None)
    if work_dir is None:
        self.logger.debug("no browser working directory, no cookie DB to save")
        return None

    cookie_location = os.sep.join(
            [work_dir.name, "chrome-user-data", "Default", "Cookies"])
    self.logger.debug("Saving Cookie DB from: %s", cookie_location)

    conn = None
    try:
        conn = sqlite3.connect(cookie_location)
        # `with conn` commits on success and rolls back on error, but it
        # does not close the connection; that happens in `finally`.
        with conn:
            conn.execute("UPDATE cookies SET persistent = 1")
    except sqlite3.Error:
        self.logger.error("exception updating cookie DB", exc_info=True)
    finally:
        if conn is not None:
            # release the file handle before the file is read back below
            conn.close()

    cookie_db = None
    try:
        with open(cookie_location, "rb") as cookie_file:
            cookie_db = cookie_file.read()
    except OSError:
        self.logger.error(
                "exception reading from cookie DB file at: %s",
                cookie_location, exc_info=True)
    return cookie_db
|
||||||
|
|
||||||
def _find_available_port(self):
|
def _find_available_port(self):
|
||||||
port_available = False
|
port_available = False
|
||||||
port = self.chrome_port
|
port = self.chrome_port
|
||||||
|
|
|
@ -91,7 +91,7 @@ class Site(brozzler.BaseDictable):
|
||||||
enable_warcprox_features=False, reached_limit=None,
|
enable_warcprox_features=False, reached_limit=None,
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None):
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@ -110,6 +110,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.last_claimed = last_claimed
|
self.last_claimed = last_claimed
|
||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
self.remember_outlinks = remember_outlinks
|
self.remember_outlinks = remember_outlinks
|
||||||
|
self.cookie_db = cookie_db
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
|
|
@ -257,7 +257,7 @@ class BrozzlerWorker:
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if self._needs_browsing(page, ydl_spy):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
browser.start(proxy=self._proxy(site))
|
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
||||||
outlinks = browser.browse_page(
|
outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
on_screenshot=_on_screenshot,
|
on_screenshot=_on_screenshot,
|
||||||
|
@ -312,6 +312,7 @@ class BrozzlerWorker:
|
||||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||||
socket.gethostname(), browser.chrome_port))
|
socket.gethostname(), browser.chrome_port))
|
||||||
outlinks = self.brozzle_page(browser, site, page)
|
outlinks = self.brozzle_page(browser, site, page)
|
||||||
|
site.cookie_db=browser.persist_and_read_cookie_db()
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(
|
self._frontier.scope_and_schedule_outlinks(
|
||||||
site, page, outlinks)
|
site, page, outlinks)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue