mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #178 from galgeek/ARI-5995-tidied
ARI-5995 instagram capture updates
This commit is contained in:
commit
802fbff986
@ -35,12 +35,16 @@
|
|||||||
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
||||||
behavior_js_template: umbraBehavior.js.j2
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
default_parameters:
|
default_parameters:
|
||||||
|
interval: 500
|
||||||
actions:
|
actions:
|
||||||
- selector: button.coreSpriteDismissLarge
|
- selector: .glyphsSpriteGrey_Close
|
||||||
|
rmSelector: '.RnEpo'
|
||||||
- selector: 'a>.eLAPa>.KL4Bh'
|
- selector: 'a>.eLAPa>.KL4Bh'
|
||||||
firstMatchOnly: true
|
limit: 1
|
||||||
|
rmSelector: '.RnEpo'
|
||||||
- selector: a.coreSpriteRightPaginationArrow
|
- selector: a.coreSpriteRightPaginationArrow
|
||||||
repeatSameElement: true
|
repeatSameElement: true
|
||||||
|
rmSelector: '.RnEpo'
|
||||||
-
|
-
|
||||||
url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$'
|
url_regex: '^https?://americaspresidents\.si\.edu/gallery.*$'
|
||||||
behavior_js_template: umbraBehavior.js.j2
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
@ -48,7 +52,7 @@
|
|||||||
interval: 2500
|
interval: 2500
|
||||||
actions:
|
actions:
|
||||||
- selector: div.see-more
|
- selector: div.see-more
|
||||||
firstMatchOnly: true
|
limit: 1
|
||||||
- selector: li.next
|
- selector: li.next
|
||||||
repeatSameElement: true
|
repeatSameElement: true
|
||||||
-
|
-
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class
|
* brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class
|
||||||
*
|
*
|
||||||
* Copyright (C) 2017-2018 Internet Archive
|
* Copyright (C) 2017-2019 Internet Archive
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
@ -35,8 +35,9 @@ class UmbraBehavior {
|
|||||||
var selector = this.actions[k].selector;
|
var selector = this.actions[k].selector;
|
||||||
var childSelector = this.actions[k].childSelector;
|
var childSelector = this.actions[k].childSelector;
|
||||||
var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false;
|
var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false;
|
||||||
var firstMatchOnly = this.actions[k].firstMatchOnly ? this.actions[k].firstMatchOnly : false;
|
var limit = this.actions[k].limit ? this.actions[k].limit : false;
|
||||||
var action = this.actions[k].do ? this.actions[k].do : 'click';
|
var action = this.actions[k].do ? this.actions[k].do : 'click';
|
||||||
|
var rmSelector = this.actions[k].rmSelector ? this.actions[k].rmSelector : null;
|
||||||
var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
|
var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
|
||||||
var didSomething = false;
|
var didSomething = false;
|
||||||
var somethingLeftAbove = false;
|
var somethingLeftAbove = false;
|
||||||
@ -59,6 +60,16 @@ class UmbraBehavior {
|
|||||||
|
|
||||||
var documentsLength = documents.length;
|
var documentsLength = documents.length;
|
||||||
for (var j = 0; j < documentsLength; j++) {
|
for (var j = 0; j < documentsLength; j++) {
|
||||||
|
if (rmSelector) {
|
||||||
|
var rmTargets = documents[j].querySelectorAll(rmSelector);
|
||||||
|
for (var i = 0; i < rmTargets.length; i++) {
|
||||||
|
if (this.isVisible(rmTargets[i])) {
|
||||||
|
rmTargets[i].remove();
|
||||||
|
didSomething = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (closeSelector) {
|
if (closeSelector) {
|
||||||
var closeTargets = documents[j].querySelectorAll(closeSelector);
|
var closeTargets = documents[j].querySelectorAll(closeSelector);
|
||||||
for (var i = 0; i < closeTargets.length; i++) {
|
for (var i = 0; i < closeTargets.length; i++) {
|
||||||
@ -70,18 +81,18 @@ class UmbraBehavior {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (firstMatchOnly) {
|
var doTargets = documents[j].querySelectorAll(selector);
|
||||||
var doTargets = [ documents[j].querySelector(selector) ];
|
|
||||||
} else {
|
var repeats = doTargets.length;
|
||||||
var doTargets = documents[j].querySelectorAll(selector);
|
if (limit && limit < repeats) {
|
||||||
|
repeats = limit;
|
||||||
}
|
}
|
||||||
|
|
||||||
var doTargetsLength = doTargets.length;
|
if (!(repeats > 0)) {
|
||||||
if (!(doTargetsLength > 0)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( var i = 0; i < doTargetsLength; i++) {
|
for ( var i = 0; i < repeats; i++) {
|
||||||
if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) {
|
if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -110,6 +121,11 @@ class UmbraBehavior {
|
|||||||
somethingLeftAbove = true;
|
somethingLeftAbove = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (limit && limit == i) {
|
||||||
|
this.nextAction();
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!didSomething) {
|
if (!didSomething) {
|
||||||
@ -129,10 +145,7 @@ class UmbraBehavior {
|
|||||||
} else {
|
} else {
|
||||||
var idleTimeMs = Date.now() - this.idleSince;
|
var idleTimeMs = Date.now() - this.idleSince;
|
||||||
if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) {
|
if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && (this.index < (this.actions.length - 1))) {
|
||||||
console.log("ready for next action");
|
this.nextAction();
|
||||||
this.index += 1;
|
|
||||||
this.idleSince = null;
|
|
||||||
window.scroll(0,0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -170,6 +183,13 @@ class UmbraBehavior {
|
|||||||
this.idleSince = null;
|
this.idleSince = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Advance this behavior to its next action.
 *
 * Logs a marker message, increments `this.index`, clears `this.idleSince`,
 * and scrolls the window back to the top-left corner.
 *
 * Takes no parameters and returns nothing; it only mutates instance state
 * and the page scroll position.
 *
 * NOTE(review): this method does not bounds-check `this.index`. The
 * idle-timeout caller only invokes it while
 * `this.index < this.actions.length - 1`, but the `limit` caller has no
 * such guard. Confirm the action loop tolerates an index past the last
 * configured action.
 */
nextAction() {
|
||||||
|
console.log("ready for next action");
|
||||||
|
this.index += 1;
|
||||||
|
// Clear the idle timestamp so idle time for the new action is not
|
// measured from the previous one (presumably re-set elsewhere; the code
|
// that assigns a non-null value is outside this view).
|
this.idleSince = null;
|
||||||
|
// Return to the top of the page before the next action runs.
|
window.scroll(0,0);
|
||||||
|
}
|
||||||
|
|
||||||
start() {
|
start() {
|
||||||
var that = this;
|
var that = this;
|
||||||
this.intervalId = setInterval(function() {
|
this.intervalId = setInterval(function() {
|
||||||
|
@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
|
|||||||
if ie_result.get('_type') == 'playlist':
|
if ie_result.get('_type') == 'playlist':
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
||||||
if ie.IE_NAME == 'youtube:playlist':
|
if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
|
||||||
# At this point ie_result['entries'] is an iterator that
|
# At this point ie_result['entries'] is an iterator that
|
||||||
# will fetch more metadata from youtube to list all the
|
# will fetch more metadata from youtube to list all the
|
||||||
# videos. We unroll that iterator here partly because
|
# videos. We unroll that iterator here partly because
|
||||||
@ -163,17 +163,9 @@ def _build_youtube_dl(worker, destdir, site):
|
|||||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||||
ie_result['entries'] = []
|
ie_result['entries'] = []
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'not downloading %s videos from this youtube '
|
'not downloading %s media files from this '
|
||||||
'playlist because we expect to capture them from '
|
'playlist because we expect to capture them from '
|
||||||
'individual watch pages',
|
'individual watch/track/detail pages',
|
||||||
len(ie_result['entries_no_dl']))
|
|
||||||
elif ie.IE_NAME == 'soundcloud:user':
|
|
||||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
|
||||||
ie_result['entries'] = []
|
|
||||||
self.logger.info(
|
|
||||||
'not downloading %s tracks from this soundcloud '
|
|
||||||
'user page because we expect to capture them from '
|
|
||||||
'individual track pages',
|
|
||||||
len(ie_result['entries_no_dl']))
|
len(ie_result['entries_no_dl']))
|
||||||
else:
|
else:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user