add calisuck.py and start howto
This commit is contained in:
parent 68867b6316
commit 36168db750
@@ -2,10 +2,10 @@
 Mirror of Krazybug's calibre scripts
 
-ToDo
+## ToDo
 
 1. Create guide on how to find calibre instances and index them with calisuck
 2. How to export as sqlite.db so calishot can use it to act as a search engine of said instances
 3. ????
 4. Profit
 5. How to pull calibre URLs from Shodan.
21 calisuck/HowTo.md Normal file
@@ -0,0 +1,21 @@
# Howto

// You need Python 3.5 at a minimum; initialize and activate a venv
python -m venv .
// Might need to activate the venv manually
. bin/activate
// Pre-reqs via pip
pip install requests fire humanize langid iso639 beautifultable
// Help commands
python calisuck.py --help
python calisuck.py index-ebooks --help
python calisuck.py download-ebooks --help
python calisuck.py download-covers --help

# Where the hell do I find instances?

### Shodan

Apparently searching for "calibre" in Shodan gives you thousands of results. Unfortunately you can't filter without making an account.
There has to be a way to automatically pull the URLs from Shodan, but it's out of my current expertise.
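
A possible starting point, assuming a Shodan account and API key and the official `shodan` Python package (`pip install shodan`); this is an untested sketch and the query string is just a guess:

```python
# Hypothetical sketch: pull candidate Calibre URLs from Shodan.
# Assumes a valid API key; the search API requires an account.
import shodan

API_KEY = "YOUR_SHODAN_API_KEY"  # placeholder

api = shodan.Shodan(API_KEY)
try:
    results = api.search('calibre')
    print("Total results:", results['total'])
    for match in results['matches']:
        # Build a URL that calisuck can index with --site=...
        print("http://{}:{}/".format(match['ip_str'], match['port']))
except shodan.APIError as e:
    print("Shodan error:", e)
```

Each printed URL can then be fed to `python calisuck.py index-ebooks --site=<url>`.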
819 calisuck/calisuck.py Normal file
@@ -0,0 +1,819 @@
#!/usr/bin/env python3

'''
calisuck: index, filter out smartly and download ebooks from Calibre open directories

Installation:

You need python 3.5 installed

Download the file as a zip, unzip it and get into the dir
OR
> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN
> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
'''

'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
    Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
'''
import sys
import os
import time
import re
import glob
import shutil
import requests
import json
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from beautifultable import BeautifulTable


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def load_metadata(path, uuid):
    filepath=path+'/'+uuid+'/metadata.json'
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except:
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # print("Metadata not found for:", uuid, "from path:", path)
        return 0


def save_metadata(path, book):
    filepath=path+'/'+book['uuid']+'/metadata.json'
    # write to a .tmp file first, then rename, so an interrupted run
    # cannot leave a truncated metadata.json behind
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath+".tmp", filepath)
    except:
        print("Unable to rename .tmp file:", filepath+".tmp")

def get_cover_path(path, uuid):
    filepath=path+'/'+uuid+'/cover.jpg'
    if os.path.isfile(filepath): return filepath
    else: return 0


def get_file_path(path, uuid, fileformat):
    files=os.listdir(path+'/'+uuid)
    if files:
        for f in files:
            fname, ext=os.path.splitext(f)
            if ext=='.'+fileformat:
                return path+'/'+uuid+'/'+f
        return 0
    else:
        return 0

def get_cover(path, book, map):
    url=book['source']['cover']
    # --map allows rewriting the host part of the recorded URL
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)

    print("Downloading cover from:", url)

    r=requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()

    filepath=path+'/'+book['uuid']+'/cover.jpg'
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def download_covers(dir='my_books', server='', map=""):
|
||||
""" Download covers for each books"""
|
||||
|
||||
for root, dirs, files in os.walk(dir, topdown=True):
|
||||
for d in dirs:
|
||||
# print()
|
||||
# print("-->", d)
|
||||
book = load_metadata(root, d)
|
||||
if book:
|
||||
# if book['source']['status'] != "ignored":
|
||||
if True:
|
||||
if not get_cover_path(root, book['uuid']):
|
||||
print()
|
||||
print("-->", d)
|
||||
print(book['uuid'])
|
||||
try:
|
||||
get_cover(root, book, map)
|
||||
except:
|
||||
print ("Unable to get cover", book['uuid'])
|
||||
else:
|
||||
pass
|
||||
# print ("Cover already present:", book['uuid'])
|
||||
else:
|
||||
print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
|
||||
else:
|
||||
print ("No ebook metadata found in:", root)
|
||||
|
||||
|
||||
def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size=r.headers['Content-Length']
    print("Size received="+hsize(size))
    return int(size)

def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']
    # --map allows rewriting the host part of the recorded URL
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)

    # --map-lib allows substituting the library id at the end of the URL
    if map_lib:
        url_s=url.split("/")
        url_s=url_s[:-1]+[map_lib]
        url='/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25,15), verify=False)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        print("File received (no Content-Length header)")

    # prefer the server-provided filename, fall back to <uuid>.<format>
    filename=re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    if len(filename):
        filepath=path+'/'+uuid+'/'+filename[0]
    else:
        filepath=path+'/'+uuid+'/'+uuid+"."+format

    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status unchanged", status+":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)

def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath=path+'/'+uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except:
            print("Unable to remove:", bookpath)
    else:
        print(uuid, "not found")

def update_done_status(book):
    source=book['source']
    if source['status']!='ignored':
        # a book is "done" once every format offered by the source
        # is present in the list of formats already downloaded
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status']="done"
        else:
            book['source']['status']="todo"

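# Illustrative example of the status rule above (values made up):
#   source formats = {'epub', 'mobi'}, downloaded formats = ['epub']
#   -> intersection {'epub'} != {'epub', 'mobi'} -> status stays "todo"
#   once both are downloaded the sets match -> status becomes "done"
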
def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with the 'jq' program.
    They are stored in subdirectories named after each book's UUID. These directories match the different books and allow you to group
    all the different formats of the same book, and eventually the cover file.
    You can mix books from different sites without any (theoretical) collisions.

    Params:
    --site=<string>    : URL of the site to index (ex: http://123.123.123.123/)
    --library=<string> (default='') : Id of the library to index. The script indexes the default library by default.
                         The id is the string following '&library_id=' in the URL
    --force-refresh    (default=False) : Force a refresh of the metadata. By default all the metadata
                         already gathered is ignored
    --start=<int>      (default=0)
    --stop=<int>       (default=0) : Allow indexing between a range of ebooks
    --inc=<int>        (default=1000) : Set the number of ebooks requested from the server per request
    """

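    # Illustrative on-disk layout produced by this command (names made up):
    #   my_books/<uuid>/metadata.json  <- one jq-friendly JSON document per book
    #   my_books/<uuid>/cover.jpg      <- added later by download-covers
    #   my_books/<uuid>/<title>.epub   <- added later by download-ebooks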
    os.makedirs(dir, exist_ok=True)

    offset= 0 if not start else start-1
    num=min(1000,inc)
    server=site.rstrip('/')
    api=server+'/ajax/'
    library= '/'+library if library else library

    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url, verify=False)
        r.raise_for_status()
    except:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=", r.json()["total_num"])
    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop

    print()
    print("Start indexing")

    # note: 'range' is used here as a simple progress counter (it shadows the builtin)
    range=offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # fetch a page of book ids, newest first
        url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        r=requests.get(url, verify=False)

        # fetch the metadata for this page of ids in one request
        books_s=",".join(str(i) for i in r.json()['book_ids'])
        url=api+'books'+library+'?ids='+books_s
        r=requests.get(url, verify=False)

        for id, r_book in r.json().items():
            uuid=r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc= f"uuid={uuid} ({r_book['title']})"
            s=f"\r--> {range}/{total_num} - {desc}"
            s='{:140.140}'.format(s)
            print(s, end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                if book:
                    # print("Metadata already present for:", uuid)
                    range+=1
                    continue

            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue

            book={}
            url=api+'book/'+id
            book['title']=r_book['title']
            book['authors']=r_book['authors']
            book['series']=r_book['series']
            book['series_index']=r_book['series_index']
            book['edition']=0
            book['uuid']=r_book['uuid']
            book['identifiers']=r_book['identifiers']
            book['comments']=r_book['comments']
            book['pubdate']=r_book['pubdate']
            book['publisher']=r_book['publisher']
            languages=r_book['languages']
            if not languages:
                # no language in the source metadata: guess it from the comments or the title
                if book['comments']:
                    text=book['comments']
                else:
                    text=book['title']
                s_language, prob=identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages']=[language]
                else:
                    book['languages']=[]
            else:
                book['languages']=[]
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags']=r_book['tags']
            book['formats']=[]
            book['metadata_version']=0.1
            source={}
            source['url']=url+library
            source['id']=id
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                range+=1
                continue
            # preserve a manually set 'ignored' status across refreshes
            if tmpbook and tmpbook['source']['status']=="ignored":
                source['status']="ignored"
            else:
                source['status']="todo"
            source['cover']=server+r_book['cover']
            source['timestamp']=r_book['timestamp']

            format_sources={}
            formats=r_book['formats']
            for f in formats:
                s={}
                url=''
                if f in r_book['main_format']:
                    url=r_book['main_format'][f]
                else:
                    url=r_book['other_formats'][f]
                s['url']=server+url

                if 'size' in r_book['format_metadata'][f]:
                    s['size']=int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size']=get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status']='todo'
                format_sources[f]=s

            source['formats']=format_sources
            book['source']=source

            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            update_done_status(book)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            range+=1
        offset=offset+num
    print()
    print("Done")

def has_languages(book, languages=[], ignore_empty_language=False):

    # quick fix: older metadata may not have a 'languages' key at all
    if not 'languages' in book:
        book['languages']=[]

    if ignore_empty_language and not book['languages']:
        return False

    if not ignore_empty_language and not book['languages']:
        # unknown language accepted
        return True

    expected_languages=list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        return False

    return True

def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):

    if ignore_empty_identifiers and not book['identifiers']:
        return False

    if not ignore_empty_identifiers and not book['identifiers']:
        # unknown identifiers accepted
        return True

    expected_identifiers=list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        return False

    return True

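# Illustrative filter behaviour (values made up):
#   has_languages({'languages': ['fre']}, languages=['eng', 'ita'])          -> False
#   has_languages({'languages': []}, ignore_empty_language=True)             -> False
#   has_identifiers({'identifiers': {'isbn': '...'}}, identifiers=['isbn'])  -> True
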
def download_ebooks(dir='my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks in matching subdirs:

    The different formats of the same book are grouped in the same directory,
    named after the book's UUID, alongside the metadata file (metadata.json).
    The status of the formats for a book and its global status are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.
    You can use different options to filter the formats for the download
    by language, size, format and identifiers (isbn, ...).
    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective download.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity) : Delimit the size in MB for the accepted formats
    --dry-run (default=False) : Run the command to simulate the download
    --languages=<string> : Restrict the download to a list of specific languages
                           (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False) : Ignore books with unidentified language
    --formats=<string> : Restrict the download to a list of specific formats
                         (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string> : Ignore a list of specific formats.
                                 Compatible with --formats.
                                 (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False) : Limit the download to 1 format per book with this preference order:
                                      'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
                                      'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar',
                                      'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string> : Restrict the download to a list of specific identifiers
                             (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR)
    '''

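    # Illustrative invocations (flags as documented above, values made up):
    #   python calisuck.py download-ebooks --dry-run --single-format --languages='["eng"]' --max-size=20
    #   python calisuck.py download-ebooks --formats='["epub"]' --ignore-empty-identifiers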
    print()

    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats

    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    print("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size=0
    total_size_by_format={}
    total_ebook_count=0
    total_format_count=0
    total_count_by_format={}
    size_max=0
    size_min=0
    language_count={}
    identifiers_count={}

    s = requests.Session()

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":

                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue

                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue

                    source=book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # no format left after filtering
                        pass
                    else:
                        ebook_kept=False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                # it shouldn't occur, but if the file is already present we download it again
                                if get_file_path(dir, uuid, f):
                                    pass

                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status']="done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)

                                ebook_kept=True
                                size=source['formats'][f]['size']
                                total_size += size
                                size_max = size if size>size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size<size_min else size_min

                                if not f in total_size_by_format:
                                    total_size_by_format[f] = size
                                else:
                                    total_size_by_format[f] += size
                                if not f in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f] += 1
                                total_format_count += 1
                            else:
                                # format ignored: no url
                                pass
                        if ebook_kept:
                            total_ebook_count+=1
                            if not book['languages']:
                                if not '<unknown>' in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>'] += 1
                            else:
                                for l in book['languages']:
                                    if not l in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l] += 1
                            if not book['identifiers']:
                                if not '<unknown>' in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>'] += 1
                            else:
                                for l in book['identifiers'].keys():
                                    if not l in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l] += 1

                            if not dry_run:
                                update_done_status(book)
                                if book['source']['status']=="done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                else:
                    print(f'--> {counter} books handled', end="\r")

    print()
    print("Reporting ...")

    table_l = BeautifulTable()
    table_l.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table_l.append_row([l, c])
    table_l.sort("Ebooks count", reverse=True)
    table_l=table_l[0:10]

    table_i = BeautifulTable()
    table_i.column_headers = ["Identifier", "Ebooks count"]
    for i, c in identifiers_count.items():
        table_i.append_row([i, c])
    table_i.sort("Ebooks count", reverse=True)
    table_i=table_i[0:10]

    print()
    print("Top 10 ebooks by language/identifier:")
    table = BeautifulTable()
    table.column_headers = ["Languages", "Identifiers"]
    table.append_row([table_l, table_i])
    print(table)

    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Format", "Size", "Ebooks count"]
    for f in total_count_by_format.keys():
        table.append_row([f, hsize(total_size_by_format[f]), total_count_by_format[f]])
    table.sort("Ebooks count", reverse=True)
    print(table)

    table_c = BeautifulTable()
    table_c.column_headers = ["", "Total count"]
    table_c.append_row(["Formats", total_format_count])
    table_c.append_row(["Ebooks", total_ebook_count])

    table_s = BeautifulTable()
    table_s.column_headers = ["", "Size"]
    table_s.append_row(["Biggest File", hsize(size_max)])
    table_s.append_row(["Total", hsize(total_size)])

    print()
    print("Summary:")
    table = BeautifulTable()
    table.column_headers = ["Total Count", "Total Size"]
    table.append_row([table_c, table_s])
    print(table)

    print()

def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    source=book['source']
    # collect the formats still in 'todo'
    my_formats=[]
    for f,v in source['formats'].items():
        if v['status']=='todo':
            my_formats.append(f)

    formats=[]
    if single_format:
        if accepted_formats:
            # keep only the first available format, in preference order
            for f in accepted_formats:
                if f in my_formats:
                    formats=[f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats=list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats=my_formats

    # then filter on size
    download_formats=formats[:]
    for f in formats:
        if not 'size' in source['formats'][f] and max_size:
            # size unknown: ignored when a max size is set
            download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
    return download_formats

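# Illustrative call (assuming `book` is a loaded metadata dict, values made up):
#   get_formats_to_download(book, accepted_formats=['epub', 'mobi'],
#                           single_format=True, max_size=20*1024*1024)
# -> at most one format: the first 'todo' entry in preference order,
#    minus anything outside the size bounds.
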
def update_format_statuses(book, refresh_ignored):
    formats=book['source']['formats']
    for f, v in formats.items():
        if v['status']=='ignored' and not refresh_ignored:
            pass
        else:
            book['source']['formats'][f]['status']='todo'


def check_ebooks(dir='my_books', dry_run=True):
    '''
    Check ebooks: mark as 'done' the formats still in 'todo' that are already present on disk
    '''

    print("Checking ...")

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":
                    print(status)
                    source=book['source']
                    update=False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status']=="todo":
                            formats= glob.glob(root+"/"+uuid+"/*."+f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status']="done"
                                update=True

                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()


if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks
    })