Merge pull request #178 from galgeek/ARI-5995-tidied

ARI-5995 instagram capture updates
This commit is contained in:
Noah Levitt 2019-11-06 13:26:56 -08:00 committed by GitHub
commit 802fbff986
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 27 deletions

View File

@ -35,12 +35,16 @@
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js_template: umbraBehavior.js.j2
default_parameters:
interval: 500
actions:
- selector: button.coreSpriteDismissLarge
- selector: .glyphsSpriteGrey_Close
rmSelector: '.RnEpo'
- selector: 'a>.eLAPa>.KL4Bh'
firstMatchOnly: true
limit: 1
rmSelector: '.RnEpo'
- selector: a.coreSpriteRightPaginationArrow
repeatSameElement: true
rmSelector: '.RnEpo'
-
url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$'
behavior_js_template: umbraBehavior.js.j2
@ -48,7 +52,7 @@
interval: 2500
actions:
- selector: div.see-more
firstMatchOnly: true
limit: 1
- selector: li.next
repeatSameElement: true
-

View File

@ -1,7 +1,7 @@
/*
* brozzler/js-templates/umbraBehavior.js.j2 - an umbra/brozzler behavior class
*
* Copyright (C) 2017-2018 Internet Archive
* Copyright (C) 2017-2019 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -35,8 +35,9 @@ class UmbraBehavior {
var selector = this.actions[k].selector;
var childSelector = this.actions[k].childSelector;
var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false;
var firstMatchOnly = this.actions[k].firstMatchOnly ? this.actions[k].firstMatchOnly : false;
var limit = this.actions[k].limit ? this.actions[k].limit : false;
var action = this.actions[k].do ? this.actions[k].do : 'click';
var rmSelector = this.actions[k].rmSelector ? this.actions[k].rmSelector : null;
var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
var didSomething = false;
var somethingLeftAbove = false;
@ -59,6 +60,16 @@ class UmbraBehavior {
var documentsLength = documents.length;
for (var j = 0; j < documentsLength; j++) {
if (rmSelector) {
var rmTargets = documents[j].querySelectorAll(rmSelector);
for (var i = 0; i < rmTargets.length; i++) {
if (this.isVisible(rmTargets[i])) {
rmTargets[i].remove();
didSomething = true;
break;
}
}
}
if (closeSelector) {
var closeTargets = documents[j].querySelectorAll(closeSelector);
for (var i = 0; i < closeTargets.length; i++) {
@ -70,18 +81,18 @@ class UmbraBehavior {
}
}
if (firstMatchOnly) {
var doTargets = [ documents[j].querySelector(selector) ];
} else {
var doTargets = documents[j].querySelectorAll(selector);
var doTargets = documents[j].querySelectorAll(selector);
var repeats = doTargets.length;
if (limit && limit < repeats) {
repeats = limit;
}
var doTargetsLength = doTargets.length;
if (!(doTargetsLength > 0)) {
if (!(repeats > 0)) {
continue;
}
for ( var i = 0; i < doTargetsLength; i++) {
for ( var i = 0; i < repeats; i++) {
if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) {
continue;
}
@ -110,6 +121,11 @@ class UmbraBehavior {
somethingLeftAbove = true;
}
}
if (limit && limit == i) {
this.nextAction();
break;
}
}
if (!didSomething) {
@ -129,10 +145,7 @@ class UmbraBehavior {
} else {
var idleTimeMs = Date.now() - this.idleSince;
if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) {
console.log("ready for next action");
this.index += 1;
this.idleSince = null;
window.scroll(0,0);
this.nextAction();
}
}
}
@ -170,6 +183,13 @@ class UmbraBehavior {
this.idleSince = null;
}
/**
 * Advance this behavior to its next action.
 *
 * Logs a progress message, increments this.index by one, resets
 * this.idleSince to null and scrolls the window to position (0, 0).
 * No bounds check on this.index is performed here; the idle-timeout
 * caller checks this.index against this.actions.length before calling.
 */
nextAction() {
console.log("ready for next action");
// presumably an index into this.actions (the idle-timeout caller compares
// it against this.actions.length - 1) -- TODO confirm
this.index += 1;
// idle time is computed elsewhere as Date.now() - this.idleSince; clearing
// it here discards the idle period accumulated so far
this.idleSince = null;
// return to the top-left of the page before continuing
window.scroll(0,0);
}
start() {
var that = this;
this.intervalId = setInterval(function() {

View File

@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
if ie_result.get('_type') == 'playlist':
self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME == 'youtube:playlist':
if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
@ -163,17 +163,9 @@ def _build_youtube_dl(worker, destdir, site):
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info(
'not downloading %s videos from this youtube '
'not downloading %s media files from this '
'playlist because we expect to capture them from '
'individual watch pages',
len(ie_result['entries_no_dl']))
elif ie.IE_NAME == 'soundcloud:user':
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info(
'not downloading %s tracks from this soundcloud '
'user page because we expect to capture them from '
'individual track pages',
'individual watch/track/detail pages',
len(ie_result['entries_no_dl']))
else:
self.logger.info(