diff --git a/brozzler/browser.py b/brozzler/browser.py
index 76b1779..852974e 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -128,11 +128,37 @@ class Browser:
     def __exit__(self, *args):
         self.stop()
 
-    def start(self, proxy=None):
+    def start(self, proxy=None, cookieDb=None):
+        """
+        Sets up a chrome instance unless one already exists.
+
+        If `cookieDb` (bytes-like contents of a chrome "Cookies" file) is
+        supplied, it is written into the new profile directory before
+        the chrome instance is created.
+        """
         if not self._chrome_instance:
             # these can raise exceptions
             self.chrome_port = self._find_available_port()
             self._work_dir = tempfile.TemporaryDirectory()
+            if cookieDb is not None:
+                # seed the new profile with the saved cookie database
+                # before chrome is launched
+                cookie_location = os.path.join(
+                        self._work_dir.name, "chrome-user-data", "Default",
+                        "Cookies")
+                try:
+                    # a brand new temporary directory has no profile
+                    # subdirectories yet, so create them first
+                    os.makedirs(
+                            os.path.dirname(cookie_location), exist_ok=True)
+                    # the cookie database is binary data, not text
+                    with open(cookie_location, "wb") as cookie_file:
+                        cookie_file.write(cookieDb)
+                except EnvironmentError:
+                    self.logger.error(
+                            "exception writing cookie file at: %s",
+                            cookie_location, exc_info=True)
+
             self._chrome_instance = Chrome(
                     port=self.chrome_port, executable=self.chrome_exe,
                     user_home_dir=self._work_dir.name,
@@ -161,6 +187,52 @@ class Browser:
             except:
                 self.logger.error("problem stopping", exc_info=True)
 
+    def read_cookie_db(self):
+        """
+        Returns the contents of chrome's cookie database file.
+
+        First sets `persistent = 1` on every row of the `cookies` table,
+        then reads the whole file.
+
+        Returns:
+            bytearray with the raw contents of the "Cookies" file, or None
+            if there is no work directory or cookie file, or if the file
+            could not be read.
+        """
+        import sqlite3  # local import, this module may not import it
+
+        if getattr(self, "_work_dir", None) is None:
+            # no work directory, e.g. the browser was never started
+            return None
+
+        cookie_location = os.path.join(
+                self._work_dir.name, "chrome-user-data", "Default", "Cookies")
+        if not os.path.isfile(cookie_location):
+            # sqlite3.connect() can create a missing database file, which
+            # would then be handed back as if it were a real cookie db
+            return None
+
+        try:
+            conn = sqlite3.connect(cookie_location)
+            try:
+                # `with conn` commits (or rolls back on error) but does not
+                # close the connection, hence the explicit close()
+                with conn:
+                    conn.execute("UPDATE cookies SET persistent = 1")
+            finally:
+                conn.close()
+        except sqlite3.Error:
+            self.logger.error("exception updating cookie DB", exc_info=True)
+
+        try:
+            with open(cookie_location, "rb") as cookie_file:
+                return bytearray(cookie_file.read())
+        except EnvironmentError:
+            self.logger.error(
+                    "exception reading from cookie DB file at: %s",
+                    cookie_location, exc_info=True)
+            return None
+
     def _find_available_port(self):
         port_available = False
         port = self.chrome_port
diff --git a/brozzler/site.py b/brozzler/site.py
index 8167924..b90c82e 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
             enable_warcprox_features=False, reached_limit=None,
             status="ACTIVE", claimed=False, start_time=None,
             last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
-            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None):
+            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
+            cookieDb=None):
 
         self.seed = seed
         self.id = id
@@ -110,6 +111,9 @@ class Site(brozzler.BaseDictable):
         self.last_claimed = last_claimed
         self.metadata = metadata
         self.remember_outlinks = remember_outlinks
+        # cookieDb is optional; bytearray(None) raises TypeError, so only
+        # convert when a cookie database was actually supplied
+        self.cookieDb = None if cookieDb is None else bytearray(cookieDb)
 
         self.scope = scope or {}
         if not "surt" in self.scope:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 36b14c8..5ba9d4a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -257,7 +257,7 @@ class BrozzlerWorker:
         if self._needs_browsing(page, ydl_spy):
             self.logger.info('needs browsing: %s', page)
             if not browser.is_running():
-                browser.start(proxy=self._proxy(site))
+                browser.start(proxy=self._proxy(site), cookieDb=site.cookieDb)
             outlinks = browser.browse_page(
                     page.url, extra_headers=site.extra_headers(),
                     on_screenshot=_on_screenshot,
@@ -312,6 +312,12 @@ class BrozzlerWorker:
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
                 outlinks = self.brozzle_page(browser, site, page)
+                # keep the previously saved cookies if the browser has
+                # nothing to report (e.g. it was never started for this
+                # page, or the cookie file could not be read)
+                cookie_db = browser.read_cookie_db()
+                if cookie_db is not None:
+                    site.cookieDb = cookie_db
                 self._frontier.completed_page(site, page)
                 self._frontier.scope_and_schedule_outlinks(
                         site, page, outlinks)