add calisuck.py and start howto
This commit is contained in:
parent 68867b6316
commit 36168db750
@@ -2,10 +2,10 @@
 Mirror of Krazybug's calibre scripts
 
-ToDo
+## ToDo
 
 1. Create guide on how to find calibre instances and index them with calisuck
 2. How to export as sqlite.db so calishot can use it to act as a search engine of said instances
 3. ????
 4. Profit
 5. How to pull calibre URLs from Shodan.
21 calisuck/HowTo.md Normal file
@@ -0,0 +1,21 @@
# Howto

// You need Python 3.5 at a minimum; initialize and activate a venv
python -m venv .
// Might need to activate the venv manually
. bin/activate
// Pre-reqs via pip
pip install requests fire humanize langid iso639 beautifultable
// Help commands
python calisuck.py --help
python calisuck.py index-ebooks --help
python calisuck.py download-ebooks --help
python calisuck.py download-covers --help

# Where the hell do I find instances?

### Shodan

Apparently searching for "calibre" in Shodan gives you thousands of results. Unfortunately you can't filter without making an account.
There has to be a way to automatically pull the URLs from Shodan, but it's out of my current expertise.
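
A possible starting point, assuming a Shodan account and API key and the official `shodan` Python package (`pip install shodan`); this is an untested sketch and the query string is just a guess:

```python
# Hypothetical sketch: pull candidate Calibre URLs from Shodan.
# Assumes a valid API key; the search API requires an account.
import shodan

API_KEY = "YOUR_SHODAN_API_KEY"  # placeholder

api = shodan.Shodan(API_KEY)
try:
    results = api.search('calibre')
    print("Total results:", results['total'])
    for match in results['matches']:
        # Build a URL that calisuck can index with --site=...
        print("http://{}:{}/".format(match['ip_str'], match['port']))
except shodan.APIError as e:
    print("Shodan error:", e)
```

Each printed URL can then be fed to `python calisuck.py index-ebooks --site=<url>`.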
819 calisuck/calisuck.py Normal file
@@ -0,0 +1,819 @@
#!/usr/bin/env python3

'''
calisuck: index, filter out smartly and download ebooks from Calibre open directories

Installation:

You need python 3.5 installed

Download the file as a zip, unzip it and get into the dir
OR
> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN
> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
'''

'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
    Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
'''
import sys
import os
import time
import re
import glob
import shutil
import requests
import json
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from beautifultable import BeautifulTable


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def load_metadata(path, uuid):
    filepath=path+'/'+uuid+'/metadata.json'
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except:
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # print("Metadata not found for:", uuid, "from path:", path)
        return 0


def save_metadata(path, book):
    filepath=path+'/'+book['uuid']+'/metadata.json'
    # write to a .tmp file first, then rename, so an interrupted run
    # cannot leave a truncated metadata.json behind
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath+".tmp", filepath)
    except:
        print("Unable to rename .tmp file:", filepath+".tmp")

def get_cover_path(path, uuid):
    filepath=path+'/'+uuid+'/cover.jpg'
    if os.path.isfile(filepath): return filepath
    else: return 0


def get_file_path(path, uuid, fileformat):
    files=os.listdir(path+'/'+uuid)
    if files:
        for f in files:
            fname, ext=os.path.splitext(f)
            if ext=='.'+fileformat:
                return path+'/'+uuid+'/'+f
        return 0
    else:
        return 0

def get_cover(path, book, map):
    url=book['source']['cover']
    # --map allows rewriting the host part of the recorded URL
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)

    print("Downloading cover from:", url)

    r=requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()

    filepath=path+'/'+book['uuid']+'/cover.jpg'
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def download_covers(dir='my_books', server='', map=""):
|
||||
""" Download covers for each books"""
|
||||
|
||||
for root, dirs, files in os.walk(dir, topdown=True):
|
||||
for d in dirs:
|
||||
# print()
|
||||
# print("-->", d)
|
||||
book = load_metadata(root, d)
|
||||
if book:
|
||||
# if book['source']['status'] != "ignored":
|
||||
if True:
|
||||
if not get_cover_path(root, book['uuid']):
|
||||
print()
|
||||
print("-->", d)
|
||||
print(book['uuid'])
|
||||
try:
|
||||
get_cover(root, book, map)
|
||||
except:
|
||||
print ("Unable to get cover", book['uuid'])
|
||||
else:
|
||||
pass
|
||||
# print ("Cover already present:", book['uuid'])
|
||||
else:
|
||||
print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
|
||||
else:
|
||||
print ("No ebook metadata found in:", root)
|
||||
|
||||
|
||||
def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size=r.headers['Content-Length']
    print("Size received="+hsize(size))
    return int(size)

def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']
    # --map allows rewriting the host part of the recorded URL
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)

    # --map-lib allows substituting the library id at the end of the URL
    if map_lib:
        url_s=url.split("/")
        url_s=url_s[:-1]+[map_lib]
        url='/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25,15), verify=False)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        print("File received (no Content-Length header)")

    # prefer the server-provided filename, fall back to <uuid>.<format>
    filename=re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    if len(filename):
        filepath=path+'/'+uuid+'/'+filename[0]
    else:
        filepath=path+'/'+uuid+'/'+uuid+"."+format

    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status unchanged", status+":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)

def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath=path+'/'+uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except:
            print("Unable to remove:", bookpath)
    else:
        print(uuid, "not found")

def update_done_status(book):
    source=book['source']
    if source['status']!='ignored':
        # a book is "done" once every format offered by the source
        # is present in the list of formats already downloaded
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status']="done"
        else:
            book['source']['status']="todo"

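# Illustrative example of the status rule above (values made up):
#   source formats = {'epub', 'mobi'}, downloaded formats = ['epub']
#   -> intersection {'epub'} != {'epub', 'mobi'} -> status stays "todo"
#   once both are downloaded the sets match -> status becomes "done"
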
def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with the 'jq' program.
    They are stored in subdirectories named after each book's UUID. These directories match the different books and allow you to group
    all the different formats of the same book, and eventually the cover file.
    You can mix books from different sites without any (theoretical) collisions.

    Params:
    --site=<string>    : URL of the site to index (ex: http://123.123.123.123/)
    --library=<string> (default='') : Id of the library to index. The script indexes the default library by default.
                         The id is the string following '&library_id=' in the URL
    --force-refresh    (default=False) : Force a refresh of the metadata. By default all the metadata
                         already gathered is ignored
    --start=<int>      (default=0)
    --stop=<int>       (default=0) : Allow indexing between a range of ebooks
    --inc=<int>        (default=1000) : Set the number of ebooks requested from the server per request
    """

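    # Illustrative on-disk layout produced by this command (names made up):
    #   my_books/<uuid>/metadata.json  <- one jq-friendly JSON document per book
    #   my_books/<uuid>/cover.jpg      <- added later by download-covers
    #   my_books/<uuid>/<title>.epub   <- added later by download-ebooks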
    os.makedirs(dir, exist_ok=True)

    offset= 0 if not start else start-1
    num=min(1000,inc)
    server=site.rstrip('/')
    api=server+'/ajax/'
    library= '/'+library if library else library

    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url, verify=False)
        r.raise_for_status()
    except:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=", r.json()["total_num"])
    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop

    print()
    print("Start indexing")

    # note: 'range' is used here as a simple progress counter (it shadows the builtin)
    range=offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # fetch a page of book ids, newest first
        url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        r=requests.get(url, verify=False)

        # fetch the metadata for this page of ids in one request
        books_s=",".join(str(i) for i in r.json()['book_ids'])
        url=api+'books'+library+'?ids='+books_s
        r=requests.get(url, verify=False)

        for id, r_book in r.json().items():
            uuid=r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc= f"uuid={uuid} ({r_book['title']})"
            s=f"\r--> {range}/{total_num} - {desc}"
            s='{:140.140}'.format(s)
            print(s, end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                if book:
                    # print("Metadata already present for:", uuid)
                    range+=1
                    continue

            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue

            book={}
            url=api+'book/'+id
            book['title']=r_book['title']
            book['authors']=r_book['authors']
            book['series']=r_book['series']
            book['series_index']=r_book['series_index']
            book['edition']=0
            book['uuid']=r_book['uuid']
            book['identifiers']=r_book['identifiers']
            book['comments']=r_book['comments']
            book['pubdate']=r_book['pubdate']
            book['publisher']=r_book['publisher']
            languages=r_book['languages']
            if not languages:
                # no language in the source metadata: guess it from the comments or the title
                if book['comments']:
                    text=book['comments']
                else:
                    text=book['title']
                s_language, prob=identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages']=[language]
                else:
                    book['languages']=[]
            else:
                book['languages']=[]
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags']=r_book['tags']
            book['formats']=[]
            book['metadata_version']=0.1
            source={}
            source['url']=url+library
            source['id']=id
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                range+=1
                continue
            # preserve a manually set 'ignored' status across refreshes
            if tmpbook and tmpbook['source']['status']=="ignored":
                source['status']="ignored"
            else:
                source['status']="todo"
            source['cover']=server+r_book['cover']
            source['timestamp']=r_book['timestamp']

            format_sources={}
            formats=r_book['formats']
            for f in formats:
                s={}
                url=''
                if f in r_book['main_format']:
                    url=r_book['main_format'][f]
                else:
                    url=r_book['other_formats'][f]
                s['url']=server+url

                if 'size' in r_book['format_metadata'][f]:
                    s['size']=int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size']=get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status']='todo'
                format_sources[f]=s

            source['formats']=format_sources
            book['source']=source

            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            update_done_status(book)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            range+=1
        offset=offset+num
    print()
    print("Done")

def has_languages(book, languages=[], ignore_empty_language=False):

    # quick fix: older metadata may not have a 'languages' key at all
    if not 'languages' in book:
        book['languages']=[]

    if ignore_empty_language and not book['languages']:
        return False

    if not ignore_empty_language and not book['languages']:
        # unknown language accepted
        return True

    expected_languages=list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        return False

    return True

def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):

    if ignore_empty_identifiers and not book['identifiers']:
        return False

    if not ignore_empty_identifiers and not book['identifiers']:
        # unknown identifiers accepted
        return True

    expected_identifiers=list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        return False

    return True

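# Illustrative filter behaviour (values made up):
#   has_languages({'languages': ['fre']}, languages=['eng', 'ita'])          -> False
#   has_languages({'languages': []}, ignore_empty_language=True)             -> False
#   has_identifiers({'identifiers': {'isbn': '...'}}, identifiers=['isbn'])  -> True
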
def download_ebooks(dir='my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks in matching subdirs:

    The different formats of the same book are grouped in the same directory,
    named after the book's UUID, alongside the metadata file (metadata.json).
    The status of the formats for a book and its global status are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.
    You can use different options to filter the formats for the download
    by language, size, format and identifiers (isbn, ...).
    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective download.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity) : Delimit the size in MB for the accepted formats
    --dry-run (default=False) : Run the command to simulate the download
    --languages=<string> : Restrict the download to a list of specific languages
                           (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False) : Ignore books with unidentified language
    --formats=<string> : Restrict the download to a list of specific formats
                         (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string> : Ignore a list of specific formats.
                                 Compatible with --formats.
                                 (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False) : Limit the download to 1 format per book with this preference order:
                                      'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
                                      'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar',
                                      'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string> : Restrict the download to a list of specific identifiers
                             (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR)
    '''

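    # Illustrative invocations (flags as documented above, values made up):
    #   python calisuck.py download-ebooks --dry-run --single-format --languages='["eng"]' --max-size=20
    #   python calisuck.py download-ebooks --formats='["epub"]' --ignore-empty-identifiers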
    print()

    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats

    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    print("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size=0
    total_size_by_format={}
    total_ebook_count=0
    total_format_count=0
    total_count_by_format={}
    size_max=0
    size_min=0
    language_count={}
    identifiers_count={}

    s = requests.Session()

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":

                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue

                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue

                    source=book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # no format left after filtering
                        pass
                    else:
                        ebook_kept=False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                # it shouldn't occur, but if the file is already present we download it again
                                if get_file_path(dir, uuid, f):
                                    pass

                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status']="done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)

                                ebook_kept=True
                                size=source['formats'][f]['size']
                                total_size += size
                                size_max = size if size>size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size<size_min else size_min

                                if not f in total_size_by_format:
                                    total_size_by_format[f] = size
                                else:
                                    total_size_by_format[f] += size
                                if not f in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f] += 1
                                total_format_count += 1
                            else:
                                # format ignored: no url
                                pass
                        if ebook_kept:
                            total_ebook_count+=1
                            if not book['languages']:
                                if not '<unknown>' in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>'] += 1
                            else:
                                for l in book['languages']:
                                    if not l in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l] += 1
                            if not book['identifiers']:
                                if not '<unknown>' in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>'] += 1
                            else:
                                for l in book['identifiers'].keys():
                                    if not l in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l] += 1

                            if not dry_run:
                                update_done_status(book)
                                if book['source']['status']=="done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                else:
                    print(f'--> {counter} books handled', end="\r")

    print()
    print("Reporting ...")

    table_l = BeautifulTable()
    table_l.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table_l.append_row([l, c])
    table_l.sort("Ebooks count", reverse=True)
    table_l=table_l[0:10]

    table_i = BeautifulTable()
    table_i.column_headers = ["Identifier", "Ebooks count"]
    for i, c in identifiers_count.items():
        table_i.append_row([i, c])
    table_i.sort("Ebooks count", reverse=True)
    table_i=table_i[0:10]

    print()
    print("Top 10 ebooks by language/identifier:")
    table = BeautifulTable()
    table.column_headers = ["Languages", "Identifiers"]
    table.append_row([table_l, table_i])
    print(table)

    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Format", "Size", "Ebooks count"]
    for f in total_count_by_format.keys():
        table.append_row([f, hsize(total_size_by_format[f]), total_count_by_format[f]])
    table.sort("Ebooks count", reverse=True)
    print(table)

    table_c = BeautifulTable()
    table_c.column_headers = ["", "Total count"]
    table_c.append_row(["Formats", total_format_count])
    table_c.append_row(["Ebooks", total_ebook_count])

    table_s = BeautifulTable()
    table_s.column_headers = ["", "Size"]
    table_s.append_row(["Biggest File", hsize(size_max)])
    table_s.append_row(["Total", hsize(total_size)])

    print()
    print("Summary:")
    table = BeautifulTable()
    table.column_headers = ["Total Count", "Total Size"]
    table.append_row([table_c, table_s])
    print(table)

    print()

def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    source=book['source']
    # collect the formats still in 'todo'
    my_formats=[]
    for f,v in source['formats'].items():
        if v['status']=='todo':
            my_formats.append(f)

    formats=[]
    if single_format:
        if accepted_formats:
            # keep only the first available format, in preference order
            for f in accepted_formats:
                if f in my_formats:
                    formats=[f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats=list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats=my_formats

    # then filter on size
    download_formats=formats[:]
    for f in formats:
        if not 'size' in source['formats'][f] and max_size:
            # size unknown: ignored when a max size is set
            download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
    return download_formats

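# Illustrative call (assuming `book` is a loaded metadata dict, values made up):
#   get_formats_to_download(book, accepted_formats=['epub', 'mobi'],
#                           single_format=True, max_size=20*1024*1024)
# -> at most one format: the first 'todo' entry in preference order,
#    minus anything outside the size bounds.
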
def update_format_statuses(book, refresh_ignored):
    formats=book['source']['formats']
    for f, v in formats.items():
        if v['status']=='ignored' and not refresh_ignored:
            pass
        else:
            book['source']['formats'][f]['status']='todo'


def check_ebooks(dir='my_books', dry_run=True):
    '''
    Check ebooks: mark as 'done' the formats still in 'todo' that are already present on disk
    '''

    print("Checking ...")

    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":
                    print(status)
                    source=book['source']
                    update=False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status']=="todo":
                            formats= glob.glob(root+"/"+uuid+"/*."+f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status']="done"
                                update=True

                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()


if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks
    })