add calisuck.py and start howto
This commit is contained in:
parent
68867b6316
commit
36168db750
@ -2,10 +2,10 @@
|
|||||||
|
|
||||||
Mirror of Krazybug's calibre scripts
|
Mirror of Krazybug's calibre scripts
|
||||||
|
|
||||||
-
|
## ToDo
|
||||||
ToDo
|
|
||||||
|
|
||||||
1. Create guide on how to find calibre instances and index them with calisuck
|
1. Create guide on how to find calibre instances and index them with calisuck
|
||||||
2. Howto export as sqlite.db so calishot can use it to act as a search engine of said instances
|
2. Howto export as sqlite.db so calishot can use it to act as a search engine of said instances
|
||||||
3. ????
|
3. ????
|
||||||
4. Profit
|
4. Profit
|
||||||
|
5. How to pull calibre URLs from Shodan.
|
21
calisuck/HowTo.md
Normal file
21
calisuck/HowTo.md
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# Howto
|
||||||
|
|
||||||
|
// You need Python 3.5 at a minimum; initialize and activate a venv
|
||||||
|
python -m venv .
|
||||||
|
// Might need to activate the venv manually
|
||||||
|
. bin/activate
|
||||||
|
// Pre-reqs via pip
|
||||||
|
pip install requests fire humanize langid iso639 beautifultable
|
||||||
|
// help commands
|
||||||
|
python calisuck.py --help
|
||||||
|
python calisuck.py index-ebooks --help
|
||||||
|
python calisuck.py download-ebooks --help
|
||||||
|
python calisuck.py download-covers --help
|
||||||
|
|
||||||
|
# Where the hell do I find instances?
|
||||||
|
### Shodan :
|
||||||
|
Apparently searching for "calibre" in Shodan gives you thousands of results. Unfortunately you can't filter without making an account.
|
||||||
|
There has to be a way to automatically pull the URLs from Shodan but it's out of my current expertise.
|
||||||
|
|
||||||
|
###
|
||||||
|
|
819
calisuck/calisuck.py
Normal file
819
calisuck/calisuck.py
Normal file
@ -0,0 +1,819 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
'''
|
||||||
|
calisuck: index, filter-out smartly and download ebooks from Calibre open directories
|
||||||
|
Installation:
|
||||||
|
You need python 3.5 installed
|
||||||
|
Download the file as a zip and unzip-it and get into the dir
|
||||||
|
OR
|
||||||
|
> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
|
||||||
|
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
|
||||||
|
> cd calisuck
|
||||||
|
>
|
||||||
|
THEN
|
||||||
|
> python3 -m venv .
|
||||||
|
> . bin/activate
|
||||||
|
> pip install requests fire humanize langid iso639 beautifultable
|
||||||
|
> python calisuck.py --help
|
||||||
|
> python calisuck.py index-ebooks --help
|
||||||
|
> python calisuck.py download-ebooks --help
|
||||||
|
> python calisuck.py download-covers --help
|
||||||
|
'''
|
||||||
|
|
||||||
|
'''
|
||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
Version 2, December 2004
|
||||||
|
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||||
|
Everyone is permitted to copy and distribute verbatim or modified
|
||||||
|
copies of this license document, and changing it is allowed as long
|
||||||
|
as the name is changed.
|
||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||||
|
0. You just DO WHAT THE FUCK YOU WANT TO.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import fire
|
||||||
|
from humanize import naturalsize as hsize
|
||||||
|
from langid.langid import LanguageIdentifier, model
|
||||||
|
import iso639
|
||||||
|
import time
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
import urllib.parse
|
||||||
|
import urllib3
|
||||||
|
from beautifultable import BeautifulTable
|
||||||
|
|
||||||
|
|
||||||
|
# Open Calibre servers commonly run with self-signed/invalid TLS certificates;
# every request below passes verify=False, so silence the per-call warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Preference order used when --single-format must pick "the best" format.
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
# langid detector (normalized probabilities) used to guess a language when the
# remote metadata has none.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
|
||||||
|
|
||||||
|
|
||||||
|
def load_metadata(path, uuid):
    """Load <path>/<uuid>/metadata.json and return it as a dict.

    Returns 0 (the project's falsy sentinel, kept for caller compatibility)
    when the file is absent or unreadable/corrupt.
    """
    filepath = path + '/' + uuid + '/metadata.json'
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        # was a bare except: catch only I/O and JSON parsing failures so
        # real programming errors (e.g. KeyboardInterrupt) still surface
        except (OSError, json.JSONDecodeError):
            print ("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # No metadata yet for this uuid: not an error, just "not indexed"
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
def save_metadata(path, book):
    """Write *book* to <path>/<uuid>/metadata.json atomically.

    The JSON is first written to metadata.json.tmp and then renamed into
    place so a crash mid-write never leaves a truncated metadata file.
    """
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        # atomic-ish replace: readers never see a half-written file
        shutil.move(filepath + ".tmp", filepath)
    # was a bare except: only filesystem errors are expected here
    except OSError:
        print("Unable to rename .tmp file:", filepath + ".tmp")
|
||||||
|
|
||||||
|
|
||||||
|
def get_cover_path(path, uuid):
    """Return the path of <path>/<uuid>/cover.jpg, or 0 when no cover exists."""
    candidate = path + '/' + uuid + '/cover.jpg'
    return candidate if os.path.isfile(candidate) else 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_path(path, uuid, fileformat):
    """Return the path of the first file under <path>/<uuid> whose extension
    is '.<fileformat>', or 0 when none exists.

    Fix: the original returned 0 as soon as the first directory entry did not
    match (os.listdir order is arbitrary), so a matching file could be missed
    when the book directory also held a cover or metadata file. Now every
    entry is examined before giving up.
    """
    bookdir = path + '/' + uuid
    for entry in os.listdir(bookdir):
        _, ext = os.path.splitext(entry)
        if ext == '.' + fileformat:
            return bookdir + '/' + entry
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_cover(path, book, map):
    """Fetch the cover image for *book* and store it as <path>/<uuid>/cover.jpg.

    When *map* is given, the url's network location is replaced by it
    (useful when the indexed host has moved). The image is written to a
    .tmp file first and renamed into place.
    """
    url = book['source']['cover']
    if map:
        parts = urllib.parse.urlparse(url)
        parts = (parts[0], map, *parts[2:])
        print(parts)
        url = urllib.parse.urlunparse(parts)

    print("Downloading cover from:", url)

    resp = requests.get(url, timeout=(20, 3), verify=False)
    resp.raise_for_status()

    target = path + '/' + book['uuid'] + '/cover.jpg'
    tmp_target = target + ".tmp"
    os.makedirs(os.path.dirname(tmp_target), exist_ok=True)
    with open(tmp_target, 'wb') as fd:
        fd.write(resp.content)
    shutil.move(tmp_target, target)
    print("Saved to:", target)
|
||||||
|
|
||||||
|
|
||||||
|
def download_covers(dir='my_books', server='', map=""):
    """ Download covers for each books"""

    # Walk every indexed book directory (one subdirectory per uuid).
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            book = load_metadata(root, d)
            if book:
                # if book['source']['status'] != "ignored":
                # NOTE(review): the status filter above was disabled with a
                # constant 'if True:', which makes the 'else' branch below
                # unreachable — kept as-is to preserve the author's intent.
                if True:
                    # Only fetch covers that are not already on disk.
                    if not get_cover_path(root, book['uuid']):
                        print()
                        print("-->", d)
                        print(book['uuid'])
                        try:
                            get_cover(root, book, map)
                        except:
                            # best-effort: a failed cover never aborts the walk
                            print ("Unable to get cover", book['uuid'])
                    else:
                        pass
                        # print ("Cover already present:", book['uuid'])
                else:
                    print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
            else:
                print ("No ebook metadata found in:", root)
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_size(url):
    """Issue a HEAD request for *url* and return its Content-Length as an int."""
    print("Downloading size:", url)
    response = requests.head(url, verify=False)
    response.raise_for_status()
    length = response.headers['Content-Length']
    print("Size received="+ hsize(length))
    return int(length)
|
||||||
|
|
||||||
|
|
||||||
|
def get_file(path, book, format, session, map, map_lib):
    """Download one *format* of *book* into <path>/<uuid>/.

    Params:
        session  : shared requests.Session (connection reuse)
        map      : optional replacement for the url's network location
        map_lib  : optional replacement for the trailing library id of the url

    The file is written to a .tmp file then renamed into place. The name is
    taken from the Content-Disposition header when present, otherwise
    <uuid>.<format> is used.
    """
    uuid = book['uuid']
    url = book['source']['formats'][format]['url']
    if map:
        # swap the host part (server moved since indexing)
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)

    if map_lib:
        # swap the trailing library id segment of the path
        url_s = url.split("/")
        url_s = url_s[:-1] + [map_lib]
        url = '/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25, 15), verify=False)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        # fixed typo: was "Fize received"
        print("Size received")

    # fix: Content-Disposition may be absent; .get avoids a KeyError and
    # falls back to the <uuid>.<format> name below
    filename = re.findall(r'filename="(.*)"', r.headers.get('Content-Disposition', ''))
    if len(filename):
        filepath = path + '/' + uuid + '/' + filename[0]
    else:
        filepath = path + '/' + uuid + '/' + uuid + "." + format

    os.makedirs(os.path.dirname(filepath + ".tmp"), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)
|
||||||
|
|
||||||
|
|
||||||
|
def set_status(uuid, status, dir='.'):
    """Set the source status of one indexed book ('todo'/'done'/'ignored')
    and persist it, printing what happened.

    No-op (with a message) when the book already has *status* or when no
    metadata exists for *uuid* under *dir*.
    """
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
        else:
            # fixed garbled message: was "Status unchanged changed "
            print("Status unchanged:", status+":", book['uuid'])
    else:
        print ("No ebook metadata found for:", uuid)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_book(uuid, path='.'):
    """Delete the directory <path>/<uuid> (one indexed book and all its files)."""
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        # was a bare except printing just "problem"; report the real cause
        except OSError as e:
            print("Unable to remove", bookpath, ":", e)
    else:
        print(uuid, "not found")
|
||||||
|
|
||||||
|
|
||||||
|
def update_done_status(book):
    """Recompute *book*'s global source status in place.

    A book is 'done' once every format offered by the source appears in the
    locally downloaded formats list; otherwise it goes back to 'todo'.
    Books marked 'ignored' are left untouched.
    """
    src = book['source']
    if src['status'] == 'ignored':
        return
    offered = set(src['formats'])
    src['status'] = "done" if offered <= set(book['formats']) else "todo"
|
||||||
|
|
||||||
|
|
||||||
|
def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with 'jq' program.
    They are stored in subdirectories with a UUID as a name. These directories do match different books and allow you to group all
    the different formats of the same book and eventually the cover file.
    You can mix books from different sites without any (theoretic) collisions

    Params:
    --site=<string>  : Url of the site to index (ex: http://123.123.123.123/)
    --library=<string> : Id of library to index. The script indexes the default library by default.
                      The id is the string following '&library_id=' in the url
    --force-refresh (default=False) : Force a refresh of the metadata. By default all the metadata
                      already gathered are ignored
    --start=<int> (default=0)
    --stop=<int> (default=0) : Allow indexing between a range of ebooks

    --inc=<int> (default=1000) : Fix the number of ebooks for each request to the server
    """

    os.makedirs(dir, exist_ok=True)

    # 'start' is 1-based on the command line; the server offset is 0-based.
    offset= 0 if not start else start-1
    num=min(1000,inc)  # page size for each server request, capped at 1000
    server=site.rstrip('/')
    api=server+'/ajax/'
    # non-default libraries are addressed as /ajax/search/<library_id>
    library= '/'+library if library else library

    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url,verify=False)
        r.raise_for_status()
    except:
        # unreachable server: nothing else to do
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=",r.json()["total_num"])
    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop

    print()
    print("Start indexing")

    # NOTE(review): 'range' shadows the builtin; it is the 1-based progress counter.
    range=offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # 1st request: one page of book ids, newest first
        url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        r=requests.get(url, verify=False)

        # 2nd request: full metadata for the whole page of ids at once
        books_s=",".join(str(i) for i in r.json()['book_ids'])
        url=api+'books'+library+'?ids='+books_s
        r=requests.get(url, verify=False)

        for id, r_book in r.json().items():
            uuid=r_book['uuid']
            if not uuid:
                # the uuid is our on-disk directory name; useless without it
                print ("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc= f"uuid={uuid} ({r_book['title']})"
            # single-line progress display, padded/truncated to 140 columns
            s=f"\r--> {range}/{total_num} - {desc}"
            s='{:140.140}'.format(s)
            print (s, end='')

            if not force_refresh:
                # skip books whose metadata is already on disk
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                if book:
                    range+=1
                    continue

            if not r_book['formats']:
                # nothing downloadable: do not index at all
                print()
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue

            # Build our own metadata record from the server's answer.
            book={}
            url=api+'book/'+id
            book['title']=r_book['title']
            book['authors']=r_book['authors']
            book['series']=r_book['series']
            book['series_index']=r_book['series_index']
            book['edition']=0
            book['uuid']=r_book['uuid']
            book['identifiers']=r_book['identifiers']
            book['comments']=r_book['comments']
            book['pubdate']=r_book['pubdate']
            book['publisher']=r_book['publisher']
            languages=r_book['languages']
            if not languages:
                # No language in the remote metadata: guess it with langid
                # from the comments (preferred) or the title.
                if book['comments']:
                    text=book['comments']
                else:
                    text=book['title']
                s_language, prob=identifier.classify(text)
                # only trust confident guesses
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages']=[language]
                else:
                    book['languages']=[]
            else:
                # normalize declared languages to iso639-2 codes
                book['languages']=[]
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags']=r_book['tags']
            book['formats']=[]
            book['metadata_version']=0.1
            source={}
            source['url']=url+library
            source['id']=id
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                range+=1
                continue
            # preserve a manual 'ignored' decision across re-indexing runs
            if tmpbook and tmpbook['source']['status']=="ignored":
                source['status']="ignored"
            else:
                source['status']="todo"
            source['cover']=server+r_book['cover']
            source['timestamp']=r_book['timestamp']

            # Record the download url and size of every available format.
            format_sources={}
            formats=r_book['formats']
            for f in formats:
                s={}
                url=''
                if f in r_book['main_format']:
                    url=r_book['main_format'][f]
                else:
                    url=r_book['other_formats'][f]
                s['url']=server+url

                if 'size' in r_book['format_metadata'][f]:
                    s['size']=int(r_book['format_metadata'][f]['size'])
                else:
                    # size missing from metadata: ask the server directly
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size']=get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status']='todo'
                format_sources[f]=s

            source['formats']=format_sources
            book['source']=source

            if not source['formats']:
                # every format was skipped above
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            update_done_status(book)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            range+=1
        offset=offset+num
    print()
    print("Done")
|
||||||
|
|
||||||
|
|
||||||
|
def has_languages(book, languages=None, ignore_empty_language=False):
    """Return True when *book* passes the language filter.

    Params:
        languages             : accepted language codes; empty/None accepts all
        ignore_empty_language : when True, books with no language are rejected

    Note: mutates *book* by adding a missing 'languages' key (a "rustine" /
    quick patch for older metadata files).
    Fix: the mutable default argument `languages=[]` was replaced by None.
    """
    languages = languages or []

    # rustine: older metadata may lack the key entirely
    if 'languages' not in book:
        book['languages'] = []

    if not book['languages']:
        # unknown language: accepted unless explicitly ignored
        return not ignore_empty_language

    # reject when a filter is set and no language matches it
    if languages and not set(book['languages']) & set(languages):
        return False

    return True
|
||||||
|
|
||||||
|
def has_identifiers(book, identifiers=None, ignore_empty_identifiers=False):
    """Return True when *book* passes the identifier (isbn, asin, ...) filter.

    Params:
        identifiers              : accepted identifier kinds; empty/None accepts all
        ignore_empty_identifiers : when True, books without identifiers are rejected
                                   (often OCR scans)

    Fix: the mutable default argument `identifiers=[]` was replaced by None.
    """
    identifiers = identifiers or []

    if not book['identifiers']:
        # no identifiers at all: accepted unless explicitly ignored
        return not ignore_empty_identifiers

    # reject when a filter is set and no identifier kind matches it
    if identifiers and not set(book['identifiers'].keys()) & set(identifiers):
        return False

    return True
|
||||||
|
|
||||||
|
def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks in matching subdirs:

    The different formats of the same book are grouped in the same directory
    with an UUID name close to the metadata file (metadata.json).
    The status of the formats for a book and its global status are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.
    You can use different options to filter the formats for the download
    by language, size, format and identifiers(isbn, ...).
    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective run.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity) : Delimit the size in MB for the accepted formats
    --dry-run (default=False) : Run the command to simulate the download
    --languages=<string> : Restrict the download to a list of specific languages
                    (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False) : Ignore books with unidentified language
    --formats=<string> : Restrict the download to a list of specific formats
                    (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string> : Ignore the formats of a specific list.
                    Compliant with --formats.
                    (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False) : Limit the download to 1 format per book with this preference order
                    'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
                    'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar',
                    'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string> : Restrict the download to a list of specific identifiers
                    (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR)
    '''

    print()

    # With --single-format and no explicit list, fall back to the module-level
    # preference order.
    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats

    # Sizes are given in MB on the command line; convert to bytes.
    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    # Counters for the final report.
    total_size=0
    total_size_by_format={}
    total_ebook_count=0
    total_format_count=0
    total_count_by_format={}
    size_max=0
    size_min=0
    language_count={}
    identifiers_count={}

    # One shared HTTP session for every download (connection reuse).
    s = requests.Session()

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":

                    # Apply the language / identifier filters first.
                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue

                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue

                    source=book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
                        pass
                    else:
                        ebook_kept=False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                # A file already on disk for a 'todo' format
                                # shouldn't occur; it is retried anyway.
                                if get_file_path(dir, uuid, f):
                                    pass

                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status']="done"
                                        if timer:
                                            # polite delay between downloads
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)

                                # Update report counters (also in --dry-run).
                                ebook_kept=True
                                size=source['formats'][f]['size']
                                total_size += size
                                size_max = size if size>size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size<size_min else size_min

                                if not f in total_size_by_format:
                                    total_size_by_format[f] = size
                                else: total_size_by_format[f] +=size
                                if not f in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f]+=1
                                total_format_count +=1
                            else:
                                # print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
                                pass
                        if ebook_kept:
                            total_ebook_count+=1
                            # Per-language counters ('<unknown>' when empty).
                            if not book['languages']:
                                if not '<unknown>' in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>']+=1
                            else:
                                for l in book['languages']:
                                    if not l in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l]+=1
                            # Per-identifier counters (isbn, asin, ...).
                            if not book['identifiers']:
                                if not '<unknown>' in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>']+=1
                            else:
                                for l in book['identifiers'].keys():
                                    if not l in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l]+=1

                            if not dry_run:
                                # Persist the 'done' transition when every
                                # wanted format has been fetched.
                                update_done_status(book)
                                if book['source']['status']=="done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                else:
                    # already done/ignored: just refresh the progress line
                    print(f'--> {counter} books handled', end="\r")

    print()
    print("Reporting ...")

    # Top-10 languages table.
    table_l = BeautifulTable()
    table_l.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table_l.append_row([l, c])
    table_l.sort("Ebooks count", reverse=True)
    table_l=table_l[0:10]

    # Top-10 identifiers table.
    table_i = BeautifulTable()
    table_i.column_headers = ["Identifier", "Ebooks count"]
    for i, c in identifiers_count.items():
        table_i.append_row([i, c])
    table_i.sort("Ebooks count", reverse=True)
    table_i=table_i[0:10]

    print()
    print("Top 10 ebooks by language/identifier:")
    table = BeautifulTable()
    table.column_headers = ["Languages", "Identifiers"]
    table.append_row([table_l, table_i])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)


    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Format", "Size", "Ebooks count"]
    for f in total_count_by_format.keys():
        table.append_row([f, hsize(total_size_by_format[f]),total_count_by_format[f]])
    table.sort("Ebooks count", reverse=True)
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)


    table_c = BeautifulTable()
    table_c.column_headers = ["", "Total count"]
    table_c.append_row(["Formats", total_format_count])
    table_c.append_row(["Ebooks", total_ebook_count])

    table_s = BeautifulTable()
    table_s.column_headers = ["", "Size"]
    # table.append_row(["Min", hsize(size_min)])
    table_s.append_row(["Biggest File", hsize(size_max)])
    table_s.append_row(["Total", hsize(total_size)])

    print()
    print("Summary:")
    table = BeautifulTable()
    table.column_headers = ["Total Count", "Total Size"]
    table.append_row([table_c, table_s])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)

    print()
|
||||||
|
|
||||||
|
|
||||||
|
def get_formats_to_download(book, accepted_formats=None, ignored_formats=None, single_format=False, min_size=0, max_size=0):
    """Select which formats of *book* should be downloaded.

    Only formats whose source status is 'todo' are considered, then the
    accepted/ignored lists and the [min_size, max_size] byte window are
    applied. With single_format, the first match in accepted_formats'
    preference order wins.

    Fixes:
    - a format with no recorded 'size' raised KeyError when max_size == 0;
      it is now kept (unknown size is only rejected under an upper bound)
    - mutable default arguments `[]` replaced by None
    """
    accepted_formats = accepted_formats or []
    ignored_formats = ignored_formats or []
    source = book['source']

    # Formats still waiting for download.
    my_formats = [f for f, v in source['formats'].items() if v['status'] == 'todo']

    formats = []
    if single_format:
        if accepted_formats:
            # accepted_formats is in preference order: first available wins
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats = list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats = my_formats

    # Apply the size window on a copy so we can remove while iterating.
    download_formats = formats[:]
    for f in formats:
        fmt = source['formats'][f]
        if 'size' not in fmt:
            # unknown size: reject only when an upper bound was requested
            if max_size:
                download_formats.remove(f)
            continue
        size = fmt['size']
        if size < min_size or (max_size and size > max_size):
            download_formats.remove(f)
    return download_formats
|
||||||
|
|
||||||
|
|
||||||
|
def update_format_statuses(book,refresh_ignored):
    """Reset every format of *book* back to 'todo' in place.

    Formats marked 'ignored' keep their status unless *refresh_ignored*
    is true, in which case they are reset as well.
    """
    for name, info in book['source']['formats'].items():
        if info['status'] != 'ignored' or refresh_ignored:
            book['source']['formats'][name]['status'] = 'todo'
|
||||||
|
|
||||||
|
import glob
def check_ebooks(dir= 'my_books', dry_run=True):
    '''
    Check ebooks:

    Reconcile on-disk files with the metadata: any 'todo' format whose file
    already exists in the book directory is flipped to 'done' (and persisted
    unless --dry-run, the default).
    '''

    print("Checking ...")

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":
                    print(status)
                    source=book['source']
                    update=False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status']=="todo":
                            # look for an already-downloaded file of this format
                            formats= glob.glob(root+"/"+uuid+"/*."+f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status']="done"
                                update=True

                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Expose the sub-commands through python-fire's CLI dispatcher.
    commands = {
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks,
    }
    fire.Fire(commands)
|
Loading…
x
Reference in New Issue
Block a user