From c62c9f9063a586aaee4502a5ef1d38177b924b9b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 8 Oct 2019 17:10:18 -0700 Subject: [PATCH 1/5] delay instagram youtube-dl captures; collapse if block --- brozzler/ydl.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index aac242a..17babcd 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site): if ie_result.get('_type') == 'playlist': self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) - if ie.IE_NAME == 'youtube:playlist': + if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}: # At this point ie_result['entries'] is an iterator that # will fetch more metadata from youtube to list all the # videos. We unroll that iterator here partly because @@ -163,17 +163,9 @@ def _build_youtube_dl(worker, destdir, site): ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result['entries'] = [] self.logger.info( - 'not downloading %s videos from this youtube ' + 'not downloading %s media files from this ' 'playlist because we expect to capture them from ' - 'individual watch pages', - len(ie_result['entries_no_dl'])) - elif ie.IE_NAME == 'soundcloud:user': - ie_result['entries_no_dl'] = list(ie_result['entries']) - ie_result['entries'] = [] - self.logger.info( - 'not downloading %s tracks from this soundcloud ' - 'user page because we expect to capture them from ' - 'individual track pages', + 'individual watch/track/detail pages', len(ie_result['entries_no_dl'])) else: self.logger.info( From 66a29dc8fe84d5692912c1156a3d95e6f3ea40ab Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 8 Oct 2019 17:32:24 -0700 Subject: [PATCH 2/5] update first close selector --- brozzler/behaviors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 7771ed0..f395259 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -35,7 +35,7 @@ behavior_js_template: umbraBehavior.js.j2 default_parameters: actions: - - selector: button.coreSpriteDismissLarge + - selector: .glyphsSpriteGrey_Close - selector: 'a>.eLAPa>.KL4Bh' firstMatchOnly: true - selector: a.coreSpriteRightPaginationArrow From ddf19121fd309665f9def7055834ad32e13ff2d8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 15 Oct 2019 11:53:57 -0700 Subject: [PATCH 3/5] limit=1 not firstMatchOnly plus nextAction --- brozzler/behaviors.yaml | 4 +-- brozzler/js-templates/umbraBehavior.js.j2 | 35 ++++++++++++++--------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index f395259..7031cec 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -37,7 +37,7 @@ actions: - selector: .glyphsSpriteGrey_Close - selector: 'a>.eLAPa>.KL4Bh' - firstMatchOnly: true + limit: 1 - selector: a.coreSpriteRightPaginationArrow repeatSameElement: true - @@ -47,7 +47,7 @@ interval: 2500 actions: - selector: div.see-more - firstMatchOnly: true + limit: 1 - selector: li.next repeatSameElement: true - diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index 7931a62..ffad126 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -1,7 +1,7 @@ /* * brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class * - * Copyright (C) 2017-2018 Internet Archive + * Copyright (C) 2017-2019 Internet Archive * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ class UmbraBehavior { var k = this.index; var selector = this.actions[k].selector; var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false; - var firstMatchOnly = this.actions[k].firstMatchOnly ? this.actions[k].firstMatchOnly : false; + var limit = this.actions[k].limit ? this.actions[k].limit : false; var action = this.actions[k].do ? this.actions[k].do : 'click'; var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null; var didSomething = false; @@ -69,18 +69,18 @@ class UmbraBehavior { } } - if (firstMatchOnly) { - var doTargets = [ documents[j].querySelector(selector) ]; - } else { - var doTargets = documents[j].querySelectorAll(selector); + var doTargets = documents[j].querySelectorAll(selector); + + var repeats = doTargets.length; + if (limit && limit < repeats) { + repeats = limit; } - var doTargetsLength = doTargets.length; - if (!(doTargetsLength > 0)) { + if (!(repeats > 0)) { continue; } - for ( var i = 0; i < doTargetsLength; i++) { + for ( var i = 0; i < repeats; i++) { if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) { continue; } @@ -98,6 +98,11 @@ class UmbraBehavior { somethingLeftAbove = true; } } + + if (didSomething && limit && limit === i+1) { + this.nextAction(); + break; + } } if (!didSomething) { @@ -117,10 +122,7 @@ class UmbraBehavior { } else { var idleTimeMs = Date.now() - this.idleSince; if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) { - console.log("ready for next action"); - this.index += 1; - this.idleSince = null; - window.scroll(0,0); + this.nextAction(); } } } @@ -158,6 +160,13 @@ class UmbraBehavior { this.idleSince = null; } + nextAction() { + console.log("ready for next action"); + this.index += 1; + this.idleSince = null; + window.scroll(0,0); + } + start() { var that = this; this.intervalId = setInterval(function() { From 37e1c7ed55e7c9ff4d75034675597dcd4b354af6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 17 Oct 2019 18:03:12 -0700 Subject: [PATCH 4/5] rmSelector to remove() login div --- brozzler/behaviors.yaml | 3 +++ brozzler/js-templates/umbraBehavior.js.j2 | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 7031cec..f1475ed 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -36,10 +36,13 @@ default_parameters: actions: - selector: .glyphsSpriteGrey_Close + rmSelector: '.RnEpo' - selector: 'a>.eLAPa>.KL4Bh' limit: 1 + rmSelector: '.RnEpo' - selector: a.coreSpriteRightPaginationArrow repeatSameElement: true + rmSelector: '.RnEpo' - url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$' behavior_js_template: umbraBehavior.js.j2 diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index ffad126..1daf39b 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -36,6 +36,7 @@ class UmbraBehavior { var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false; var limit = this.actions[k].limit ? this.actions[k].limit : false; var action = this.actions[k].do ? this.actions[k].do : 'click'; + var rmSelector = this.actions[k].rmSelector ? this.actions[k].rmSelector : null; var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null; var didSomething = false; var somethingLeftAbove = false; @@ -58,6 +59,16 @@ class UmbraBehavior { var documentsLength = documents.length; for (var j = 0; j < documentsLength; j++) { + if (rmSelector) { + var rmTargets = documents[j].querySelectorAll(rmSelector); + for (var i = 0; i < rmTargets.length; i++) { + if (this.isVisible(rmTargets[i])) { + rmTargets[i].remove(); + didSomething = true; + break; + } + } + } if (closeSelector) { var closeTargets = documents[j].querySelectorAll(closeSelector); for (var i = 0; i < closeTargets.length; i++) { From ac4a3f9914c5643025dde88b8a16c017daa345a8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 5 Nov 2019 17:23:01 -0800 Subject: [PATCH 5/5] simpler check, interval; 500 --- brozzler/behaviors.yaml | 1 + brozzler/js-templates/umbraBehavior.js.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index f1475ed..5157dbc 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -34,6 +34,7 @@ url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js_template: umbraBehavior.js.j2 default_parameters: + interval: 500 actions: - selector: .glyphsSpriteGrey_Close rmSelector: '.RnEpo' diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index 1daf39b..a7c113a 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -110,7 +110,7 @@ class UmbraBehavior { } } - if (didSomething && limit && limit === i+1) { + if (limit && limit == i) { this.nextAction(); break; }