Merge pull request #178 from galgeek/ARI-5995-tidied

ARI-5995 instagram capture updates
This commit is contained in:
Noah Levitt 2019-11-06 13:26:56 -08:00 committed by GitHub
commit 802fbff986
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 27 deletions

View File

@ -35,12 +35,16 @@
url_regex: '^https?://(?:www\.)?instagram\.com/.*$' url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js_template: umbraBehavior.js.j2 behavior_js_template: umbraBehavior.js.j2
default_parameters: default_parameters:
interval: 500
actions: actions:
- selector: button.coreSpriteDismissLarge - selector: .glyphsSpriteGrey_Close
rmSelector: '.RnEpo'
- selector: 'a>.eLAPa>.KL4Bh' - selector: 'a>.eLAPa>.KL4Bh'
firstMatchOnly: true limit: 1
rmSelector: '.RnEpo'
- selector: a.coreSpriteRightPaginationArrow - selector: a.coreSpriteRightPaginationArrow
repeatSameElement: true repeatSameElement: true
rmSelector: '.RnEpo'
- -
url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$' url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$'
behavior_js_template: umbraBehavior.js.j2 behavior_js_template: umbraBehavior.js.j2
@ -48,7 +52,7 @@
interval: 2500 interval: 2500
actions: actions:
- selector: div.see-more - selector: div.see-more
firstMatchOnly: true limit: 1
- selector: li.next - selector: li.next
repeatSameElement: true repeatSameElement: true
- -

View File

@ -1,7 +1,7 @@
/* /*
* brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class * brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class
* *
* Copyright (C) 2017-2018 Internet Archive * Copyright (C) 2017-2019 Internet Archive
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -35,8 +35,9 @@ class UmbraBehavior {
var selector = this.actions[k].selector; var selector = this.actions[k].selector;
var childSelector = this.actions[k].childSelector; var childSelector = this.actions[k].childSelector;
var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false; var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false;
var firstMatchOnly = this.actions[k].firstMatchOnly ? this.actions[k].firstMatchOnly : false; var limit = this.actions[k].limit ? this.actions[k].limit : false;
var action = this.actions[k].do ? this.actions[k].do : 'click'; var action = this.actions[k].do ? this.actions[k].do : 'click';
var rmSelector = this.actions[k].rmSelector ? this.actions[k].rmSelector : null;
var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null; var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
var didSomething = false; var didSomething = false;
var somethingLeftAbove = false; var somethingLeftAbove = false;
@ -59,6 +60,16 @@ class UmbraBehavior {
var documentsLength = documents.length; var documentsLength = documents.length;
for (var j = 0; j < documentsLength; j++) { for (var j = 0; j < documentsLength; j++) {
if (rmSelector) {
var rmTargets = documents[j].querySelectorAll(rmSelector);
for (var i = 0; i < rmTargets.length; i++) {
if (this.isVisible(rmTargets[i])) {
rmTargets[i].remove();
didSomething = true;
break;
}
}
}
if (closeSelector) { if (closeSelector) {
var closeTargets = documents[j].querySelectorAll(closeSelector); var closeTargets = documents[j].querySelectorAll(closeSelector);
for (var i = 0; i < closeTargets.length; i++) { for (var i = 0; i < closeTargets.length; i++) {
@ -70,18 +81,18 @@ class UmbraBehavior {
} }
} }
if (firstMatchOnly) { var doTargets = documents[j].querySelectorAll(selector);
var doTargets = [ documents[j].querySelector(selector) ];
} else { var repeats = doTargets.length;
var doTargets = documents[j].querySelectorAll(selector); if (limit && limit < repeats) {
repeats = limit;
} }
var doTargetsLength = doTargets.length; if (!(repeats > 0)) {
if (!(doTargetsLength > 0)) {
continue; continue;
} }
for ( var i = 0; i < doTargetsLength; i++) { for ( var i = 0; i < repeats; i++) {
if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) { if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) {
continue; continue;
} }
@ -110,6 +121,11 @@ class UmbraBehavior {
somethingLeftAbove = true; somethingLeftAbove = true;
} }
} }
if (limit && limit == i) {
this.nextAction();
break;
}
} }
if (!didSomething) { if (!didSomething) {
@ -129,10 +145,7 @@ class UmbraBehavior {
} else { } else {
var idleTimeMs = Date.now() - this.idleSince; var idleTimeMs = Date.now() - this.idleSince;
if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) { if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) {
console.log("ready for next action"); this.nextAction();
this.index += 1;
this.idleSince = null;
window.scroll(0,0);
} }
} }
} }
@ -170,6 +183,13 @@ class UmbraBehavior {
this.idleSince = null; this.idleSince = null;
} }
nextAction() {
console.log("ready for next action");
this.index += 1;
this.idleSince = null;
window.scroll(0,0);
}
start() { start() {
var that = this; var that = this;
this.intervalId = setInterval(function() { this.intervalId = setInterval(function() {

View File

@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
if ie_result.get('_type') == 'playlist': if ie_result.get('_type') == 'playlist':
self.logger.info( self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url) 'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME == 'youtube:playlist': if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
# At this point ie_result['entries'] is an iterator that # At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the # will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because # videos. We unroll that iterator here partly because
@ -163,17 +163,9 @@ def _build_youtube_dl(worker, destdir, site):
ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = [] ie_result['entries'] = []
self.logger.info( self.logger.info(
'not downloading %s videos from this youtube ' 'not downloading %s media files from this '
'playlist because we expect to capture them from ' 'playlist because we expect to capture them from '
'individual watch pages', 'individual watch/track/detail pages',
len(ie_result['entries_no_dl']))
elif ie.IE_NAME == 'soundcloud:user':
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info(
'not downloading %s tracks from this soundcloud '
'user page because we expect to capture them from '
'individual track pages',
len(ie_result['entries_no_dl'])) len(ie_result['entries_no_dl']))
else: else:
self.logger.info( self.logger.info(