First public version

Branch: master
Author: Krazybug (2 years ago)
Commit: 3848c62530

.gitignore (5 lines added)

@@ -0,0 +1,5 @@
*.pyc
calishot.egg-info/
.vscode
.DS_Store
output/

README.md
@@ -0,0 +1,108 @@
# CALISHOT Guidelines
## Installation
You need Poetry pre-installed.
Clone the repository, then:
```
poetry install
poetry shell
mkdir output
cd output
```
Then create a `list.txt` file listing all your Calibre URLs, one per line.
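For example, `list.txt` might look like this (hypothetical URLs, one Calibre server per line):
```
http://calibre.example.com:8080
https://books.example.org
```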
## Indexing
```
python ../calishot import list.txt
python ../calishot check
sqlite-utils sites.db 'select url from sites where status="online" ' | jq -r '.[].url' > online.txt
python ../calishot index-site-list online.txt
python ../calishot build-index --english
mv index.db index-eng.db
python ../calishot build-index --noenglish
mv index.db index-non-eng.db
# display the global size and the total count of formats
python ../calishot get-stats
python ../calishot index-to-json | jq -r '. | {title: .title.label, authors, year, language, publisher, series, desc: .title.href, tags, identifiers, formats, format_links: [.links[].href]}' > calibre.json
sqlite-utils index.db 'select uuid, title, authors, year, series, language, formats, publisher, tags, identifiers from summary where instr(formats, "mp3") >0 order by uuid limit 101'
```
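Once the index is built, you can also run full-text queries against the `summary_fts` table created by `build-index`, for example:
```
sqlite-utils index.db "select * from summary_fts where summary_fts match 'title:fre*'"
```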
## Deployment
1. Install Poetry, Datasette and its plugins:
```
poetry new calishot
poetry shell
poetry add datasette
poetry add datasette-json-html
poetry add datasette-pretty-json
```
Alternatively, you can install it with virtualenv/pip if you don't want to use Poetry:
```
python -m venv calishot
. ./calishot/bin/activate
pip install datasette
pip install datasette-json-html
pip install datasette-pretty-json
```
2. Prepare the calishot settings:
Download the SQLite db file (e.g. `index-non-eng.db`) to the same directory, then:
```
cat <<EOF > metadata.json
{
"databases": {
"index": {
"tables": {
"summary": {
"sort": "title",
"searchmode": "raw"
}
}
}
}
}
EOF
```
You can now run a local test:
```
datasette serve index-non-eng.db --config sql_time_limit_ms:50000 --config allow_download:off --config max_returned_rows:2000 --config num_sql_threads:10 --config allow_csv_stream:off --metadata metadata.json
```
Open your browser to http://localhost:8001/ and check the result.
3. Now you're ready to publish :)
Install [heroku-cli](https://devcenter.heroku.com/articles/heroku-cli), then:
If the Heroku CLI needs to trust a custom CA bundle, export it first: `export NODE_EXTRA_CA_CERTS=<your_dir>/calishot/CAall.cer`
```
heroku login -i
datasette publish heroku index-non-eng.db -n calishot-non-eng-1 --install=datasette-json-html --install=datasette-pretty-json --extra-options="--config sql_time_limit_ms:50000 --config allow_download:off --config num_sql_threads:10 --config max_returned_rows:500 --config allow_csv_stream:off" --metadata metadata.json
```

calishot/__init__.py
@@ -0,0 +1 @@
__version__ = '0.1.0'

calishot/__main__.py
@@ -0,0 +1,19 @@
import fire
from site_index import import_urls_from_file, check_calibre_list, check_calibre_site
from calistat import index_site_list, get_stats, index_site_list_seq
from ebooks_index import build_index, index_to_json
from diff import diff
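# python-fire turns this mapping into the calishot CLI: each dict key becomes a sub-command.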
if __name__ == "__main__":
fire.Fire({
"import": import_urls_from_file,
"check":check_calibre_list,
"check-site":check_calibre_site,
"index-site-list": index_site_list,
"index-site-list-seq": index_site_list_seq,
"build-index": build_index,
"get-stats": get_stats,
"index-to-json": index_to_json,
"diff": diff
})

calishot/calistat.py
@@ -0,0 +1,514 @@
import sys
import os
import time
import re
import shutil
from typing import Dict
import requests
import json
from humanize import naturalsize as hsize
import humanize
from langid.langid import LanguageIdentifier, model
import iso639
import time
import json
import unidecode
from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from pathlib import Path
import uuid
from sqlite_utils import Database
import gevent
from gevent import monkey
from gevent import Timeout
from gevent.pool import Pool
monkey.patch_socket()
# monkey.patch_all()
import fire
from site_index import init_sites_db, get_libs_from_site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def get_site_db(uuid, dir):
f_uuid=str(uuid)+".db"
print(f_uuid)
path = Path(dir) / str(f_uuid)
return Database(path)
def init_site_db(site, _uuid="", dir="."):
if not _uuid:
s_uuid=str(uuid.uuid4())
else:
s_uuid=str(_uuid)
f_uuid=s_uuid+".db"
path = Path(dir) / f_uuid
db = Database(path)
if not "site" in db.table_names():
s=db["site"]
s.insert(
{
"uuid": s_uuid,
"urls": [site],
"version": "",
"major": 0,
"schema_version": 1,
}
, pk='uuid'
)
if not "ebooks" in db.table_names():
db["ebooks"].create({
"uuid": str,
"id": int,
"library": str, #TODO: manage libraries ids as integer to prevent library renam on remote site
"title": str,
"authors": str,
"series": str,
"series_index": int,
# "edition": int,
"language": str,
"desc": str,
"identifiers": str,
"tags": str,
"publisher": str,
"pubdate": str,
"last_modified": str,
"timestamp": str,
"formats": str,
"cover": int,
# "epub": int,
# "mobi": int,
# "pdf": int,
# TODO: add the most common formats to avoid alter tables
}, pk="uuid")
if not "libraries" in db.table_names():
db["libraries"].create({
"id": int,
"names": str
}, pk="id")
# db.table("ebooks", pk="id")
# db.table("ebooks", pk="id", alter=True
return db
def get_format_url(db, book, format):
url = json.loads(list(db['site'].rows)[0]["urls"])[0]
library=book['library']
id_=str(book['id'])
f_url = url+"/get/"+format+"/"+id_+"/"+library
return f_url
def get_desc_url(db, book):
url = json.loads(list(db['site'].rows)[0]["urls"])[0]
library=book['library']
id_=str(book['id'])
f_urls=[]
major= list(db['site'].rows)[0]["major"]
if major >= 3:
d_url =url+"#book_id="+id_+"&library_id="+library+"&panel=book_details"
else:
d_url =url+"/browse/book/"+id_
return d_url
def save_books_metadata_from_site(db, books):
uuid = list(db['site'].rows)[0]["uuid"]
# print(uuid)
ebooks_t=db["ebooks"]
# print([c[1] for c in ebooks_t.columns])
# for b in books:
# print(b['title'])
# ebooks_t.insert(b, alter=True)
# ebooks_t.insert_all(books, alter=True)
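    # alter=True lets sqlite-utils add columns on the fly (e.g. the per-format size fields that are not in the base schema)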
ebooks_t.insert_all(books, alter=True, pk='uuid', batch_size=1000)
# print([c[1] for c in ebooks_t.columns])
def load_metadata(dir, uuid):
pass
def update_done_status(book):
source=book['source']
if source['status']!='ignored':
if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
book['source']['status']="done"
else:
book['source']['status']="todo"
def index_site_list_seq(file):
with open(file) as f:
for s in f.readlines():
# try:
# index_ebooks(s.rstrip())
# except:
# continue
index_ebooks(s.rstrip())
def index_site_list(file):
pool = Pool(40)
with open(file) as f:
sites = f.readlines()
sites= [s.rstrip() for s in sites]
print(sites)
pool.map(index_ebooks_except, sites)
def index_ebooks_except(site):
try:
index_ebooks(site)
    except Exception as e:
        print("Error on site", site, ":", e)
def index_ebooks(site, library="", start=0, stop=0, dir=".", num=1000, force_refresh=False):
#TODO old calibres don't manage libraries. /ajax/library-info endpoint doesn't exist. It would be better to manage calibre version directly
libs=[]
try:
libs= get_libs_from_site(site)
except:
        print("Old Calibre server: no /ajax/library-info endpoint")
_uuid=str(uuid.uuid4())
if libs:
for lib in libs:
index_ebooks_from_library(site=site, _uuid=_uuid, library=lib, start=start, stop=stop, dir=dir, num=num, force_refresh=force_refresh)
else:
index_ebooks_from_library(site=site, _uuid=_uuid, start=start, stop=stop, dir=dir, num=num, force_refresh=force_refresh)
def index_ebooks_from_library(site, _uuid="", library="", start=0, stop=0, dir=".", num=1000, force_refresh=False):
offset= 0 if not start else start-1
num=min(1000, num)
server=site.rstrip('/')
api=server+'/ajax/'
lib=library
library= '/'+library if library else library
timeout=15
print(f"\nIndexing library: {lib} from server: {server} ")
url=api+'search'+library+'?num=0'
print(f"\nGetting ebooks count of library: {lib} from server:{server} ")
# print(url)
try:
r=requests.get(url, verify=False, timeout=(timeout, 30))
r.raise_for_status()
except requests.RequestException as e:
print("Unable to open site:", url)
return
# pass
except Exception as e:
print ("Other issue:", e)
return
# pass
except :
print("Wazza !!!!")
sys.exit(1)
total_num=int(r.json()["total_num"])
total_num= total_num if not stop else stop
print()
print(f"Total count={total_num} from {server}")
# library=r.json()["base_url"].split('/')[-1]
# base_url=r.json()["base_url"]
# cache_db=init_cache_db(dir=dir)
# _uuid=get_uuid_from_url(cache_db)
db=init_site_db(site, _uuid=_uuid, dir=dir)
r_site = (list(db['site'].rows)[0])
r_site['version']=r.headers['server']
    r_site['major']=int(re.search(r'calibre.(\d).*', r.headers['server']).group(1))
db["site"].upsert(r_site, pk='uuid')
print()
range=offset+1
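    # Page through the library: fetch up to `num` ebook ids per request from /ajax/search,
    # then pull their full metadata from /ajax/books.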
while offset < total_num:
remaining_num = min(num, total_num - offset)
# print()
# print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
print ('\r {:180.180}'.format(f'Downloading ids: offset={str(offset)} count={str(remaining_num)} from {server}'), end='')
# url=server+base_url+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
# print("->", url)
try:
r=requests.get(url, verify=False, timeout=(timeout, 30))
r.raise_for_status()
except requests.RequestException as e:
print ("Connection issue:", e)
return
# pass
except Exception as e:
print ("Other issue:", e)
return
# pass
except :
print ("Wazza !!!!")
return
# print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))
# print()
# print("Downloading metadata from", str(offset+1), "to", str(offset+remaining_num))
print ('\r {:180.180}'.format(f'Downloading metadata from {str(offset+1)} to {str(offset+remaining_num)}/{total_num} from {server}'), end='')
books_s=",".join(str(i) for i in r.json()['book_ids'])
url=api+'books'+library+'?ids='+books_s
# url=server+base_url+'/books?ids='+books_s
# print("->", url)
# print ('\r{:190.190}'.format(f'url= {url} ...'), end='')
try:
r=requests.get(url, verify=False, timeout=(60, 60))
r.raise_for_status()
except requests.RequestException as e:
print ("Connection issue:", e)
return
# pass
except Exception as e:
print ("Other issue:", e)
return
# pass
except :
print ("Wazza !!!!")
return
# print(len(r.json()), "received")
print ('\r {:180.180}'.format(f'{len(r.json())} received'), end='')
books=[]
for id, r_book in r.json().items():
uuid=r_book['uuid']
if not uuid:
print ("No uuid for ebook: ignored")
continue
if r_book['authors']:
desc= f"({r_book['title']} / {r_book['authors'][0]})"
else:
desc= f"({r_book['title']})"
# print (f'\r--> {range}/{total_num} - {desc}', end='')
# print (f'\r{server}--> {range}/{total_num} - {desc}', end='')
print ('\r {:180.180} '.format(f'{range}/{total_num} ({server} : {uuid} --> {desc}'), end='')
if not force_refresh:
# print("Checking local metadata:", uuid)
try:
book = load_metadata(dir, uuid)
except:
print("Unable to get metadata from:", uuid)
range+=1
continue
if book:
print("Metadata already present for:", uuid)
range+=1
continue
if not r_book['formats']:
# print("No format found for {}".format(r_book['uuid']))
range+=1
continue
book={}
book['uuid']=r_book['uuid']
book['id']=id
book['library']=lib
# book['title']=r_book['title']
book['title']=unidecode.unidecode(r_book['title'])
# book['authors']=r_book['authors']
if r_book['authors']:
book['authors']=[unidecode.unidecode(s) for s in r_book['authors']]
# book['desc']=""
book['desc']=r_book['comments']
if r_book['series']:
book['series']=unidecode.unidecode(r_book['series'])
# book['series']=[unidecode.unidecode(s) for s in r_book['series']]
s_i=r_book['series_index']
if (s_i):
book['series_index']=int(s_i)
# book['edition']=0
book['identifiers']=r_book['identifiers']
# book['tags']=r_book['tags']
if r_book['tags']:
book['tags']=[unidecode.unidecode(s) for s in r_book['tags']]
book['publisher']=r_book['publisher']
# book['publisher']=unidecode.unidecode(r_book['publisher'])
book['pubdate']=r_book['pubdate']
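            # No language declared by the server: guess it from the comments (or the title)
            # with langid, and keep the guess only when the confidence is >= 0.85.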
if not r_book['languages']:
# if True:
text=r_book['title']+". "
if r_book['comments']:
text=r_book['comments']
s_language, prob=identifier.classify(text)
if prob >= 0.85:
language = iso639.to_iso639_2(s_language)
book['language']=language
else:
book['language']=''
else:
book['language']=iso639.to_iso639_2(r_book['languages'][0])
if r_book['cover']:
book['cover']= True
else:
book['cover']= False
book['last_modified']=r_book['last_modified']
book['timestamp']=r_book['timestamp']
book['formats']=[]
formats=r_book['formats']
for f in formats:
if 'size' in r_book['format_metadata'][f]:
size=int(r_book['format_metadata'][f]['size'])
else:
# print()
# print(f"Size not found for format '{f}' uuid={uuid}: skipped")
                    size = 0  # don't carry over the previous format's size
#TODO query the size when the function to rebuild the full url is ready
#
# print("Trying to get size online: {}".format('url'))
# try:
# size=get_file_size(s['url'])
# except:
# print("Unable to access size for format '{}' : {} skipped".format(f, uuid))
# continue
book[f]=(size)
book['formats'].append(f)
if not book['formats']:
# if not c_format:
# print()
# print(f"No format found for {book['uuid']} id={book['id']} : skipped")
range+=1
# continue
books.append(book)
range+=1
# print()
print("Saving metadata")
print ('\r {:180.180}'.format(f'Saving metadata from {server}'), end='')
try:
save_books_metadata_from_site(db, books)
print('\r {:180.180}'.format(f'--> Saved {range-1}/{total_num} ebooks from {server}'), end='')
except BaseException as err:
print (err)
print()
print()
# try:
# save_metadata(db, books)
# except:
# print("Unable to save book metadata")
offset=offset+num
def query(query_str="", dir="."):
dbs=[]
    for f_name in os.listdir(dir):
        db = Database(Path(dir) / f_name)
# print (db["ebooks"].count)
# for row in db["site"].rows:
# print (f'{row["urls"]}: {db["ebooks"].count}')
# db["ebooks"].search(query_str)
# url=db['site'].get(1)['urls'][0]
url=db['site'].get(1)
print (url)
for ebook in db["ebooks"].rows_where(query_str):
# print (f"{ebook['title']} ({ebook['uuid']})")
print (ebook)
def get_stats(dir="."):
dbs=[]
size=0
count=0
for f in os.listdir(dir):
if not f.endswith(".db"):
continue
if f == "index.db":
continue
path = Path(dir) / f
dbs.append(Database(path))
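    # Sum the recorded size of every format of every ebook across all the per-site databases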
for db in dbs:
for i, ebook in enumerate(db["ebooks"].rows):
uuid=ebook['uuid']
title=ebook['title']
formats=json.loads(ebook['formats'])
# print(formats)
for f in formats:
if f in ebook:
if ebook[f]:
size+=ebook[f]
count+=1
# print (f'\r{count} {f} --> {uuid}: {title}', end ='')
# print (f'\r{count} : {uuid} --> {f}', end='')
print (f'\r{count} formats - ebook : {uuid}', end='')
print()
print("Total count of formats:", humanize.intcomma(count))
print("Total size:", hsize(size))
print()
if __name__ == "__main__":
fire.Fire()

calishot/diff.py
@@ -0,0 +1,64 @@
from pathlib import Path
from sqlite_utils import Database
from sqlite_utils.db import NotFoundError
import json
def init_diff_db(dir="."):
path = Path(dir) / "diff.db"
db_diff = Database(path)
if not "summary" in db_diff.table_names():
db_diff["summary"].create({
"uuid": str,
"title": str,
# "cover": str,
# "source": str
"authors": str,
"year": str,
"series": str,
"language": str,
"links": str,
# "desc": str,
"publisher": str,
"tags": str,
"identifiers": str,
"formats": str,
"status": str,
"old_location":str
}
# )
, pk="uuid")
return db_diff
def diff(old, new, dir=".", ):
path = Path(dir) / old
db_old = Database(path)
path = Path(dir) / new
db_new = Database(path)
path = Path(dir) / "diff.db"
db_diff =init_diff_db(dir)
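    # Compare the new index against the old one by uuid: books missing from the old index
    # are flagged NEW, books whose title link (location) has changed are flagged MOVED.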
for i, n_book in enumerate(db_new["summary"].rows):
n_uuid = n_book['uuid']
print(i, n_uuid)
try:
o_book = db_old["summary"].get(n_uuid)
# print(n_uuid, '=OK')
o_loc=json.loads(o_book['title'])['href']
n_loc=json.loads(n_book['title'])['href']
if o_loc != n_loc :
print(n_uuid, 'MOVED')
n_book["status"]="MOVED"
n_book["old_location"]=o_loc
n_book.pop ('cover', None)
db_diff["summary"].insert(n_book, pk='uuid')
except NotFoundError:
# print(n_uuid, '=NOK')
n_book.pop ('cover', None)
n_book["status"]="NEW"
db_diff["summary"].insert(n_book, pk='uuid')

calishot/ebooks_index.py
@@ -0,0 +1,237 @@
import os
import sys
import json
from pathlib import Path
from sqlite_utils import Database
from humanize import naturalsize as hsize
from calistat import get_desc_url, get_format_url
def init_index_db(dir="."):
path = Path(dir) / "index.db"
db_index = Database(path)
if not "summary" in db_index.table_names():
db_index["summary"].create({
"uuid": str,
"cover": str,
"title": str,
# "source": str
"authors": str,
"year": str,
"series": str,
"language": str,
"links": str,
# "desc": str,
"publisher": str,
"tags": str,
"identifiers": str,
"formats": str
}
# )
, pk="uuid")
# db_index.table("index", pk="uuid")
# db_index.table("summary").enable_fts(["title"])
# db_index["summary"].enable_fts(["title", "authors", "series", "uuid", "language", "identifiers", "tags", "publisher", "formats", "pubdate"])
db_index["summary"].enable_fts(["title", "authors", "series", "language", "identifiers", "tags", "publisher", "formats", "year"])
return db_index
def get_img_url(db, book):
url = json.loads(list(db['site'].rows)[0]["urls"])[0]
library=book['library']
id_=str(book['id'])
f_urls=[]
major= list(db['site'].rows)[0]["major"]
if major >= 3:
d_url =url+"/get/thumb/"+id_+"/"+library+ "?sz=600x800"
else:
# d_url =url+"/get/thumb/"+id_
d_url =url+"/get/thumb_90_120/"+id_
return d_url
def build_index (dir='.', english=True):
dbs=[]
for f in os.listdir(dir):
if not f.endswith(".db"):
continue
if f in ("index.db", "sites.db"):
continue
p = Path(dir) / f
print(f)
try:
db = Database(p.resolve())
except:
print ("Pb with:", f)
dbs.append(db)
db_index = init_index_db(dir=dir)
index_t=db_index["summary"]
batch_size=10000
count=0
summaries=[]
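    # Flatten each per-site ebook row into a summary record with datasette-json-html
    # style objects: a title link, a cover thumbnail and one download link per format.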
for db in dbs:
for i, ebook in enumerate(db["ebooks"].rows):
            if english and (not ebook['language'] or ebook['language'] != "eng"):
continue
elif not english and ebook['language'] == "eng":
continue
if ebook['authors']:
ebook['authors']=formats=json.loads(ebook['authors'])
# if ebook['series']:
# ebook['series']=formats=json.loads(ebook['series'])
if ebook['identifiers']:
ebook['identifiers']=formats=json.loads(ebook['identifiers'])
if ebook['tags']:
ebook['tags']=formats=json.loads(ebook['tags'])
ebook['formats']=formats=json.loads(ebook['formats'])
ebook['links']=""
summary = {k: v for k, v in ebook.items() if k in ("uuid","title", "authors", "series", "language", "formats", "tags", "publisher", "identifiers")}
# summary = {k: v for k, v in ebook.items() if k in ("uuid","title", "authors", "series", "identifiers", "language", "tags", "publisher", "formats")}
summary['title']={'href': get_desc_url(db, ebook), 'label': ebook['title']}
summary["cover"]= {"img_src": get_img_url(db, ebook), "width": 90}
formats=[]
for f in ebook['formats']:
formats.append({'href': get_format_url(db, ebook, f), 'label': f"{f} ({hsize(ebook[f])})"})
summary['links']=formats
pubdate=ebook['pubdate']
summary['year']=pubdate[0:4] if pubdate else ""
summaries.append(summary)
# print(summary)
count+=1
print (f"\r{count} - ebook handled: {ebook['uuid']}", end='')
if not count % batch_size:
# print()
# print(f"Saving summary by batch: {len(summaries)}")
# print(summaries)
# index_t.upsert_all(summaries, batch_size=1000, pk='uuid')
# index_t.insert_all(summaries, batch_size=1000, pk='uuid')
try:
index_t.insert_all(summaries, batch_size=batch_size)
except Exception as e:
# dump = [(s['uuid'],s['links']) for s in summaries]
# print(dump)
print()
                    print("UUID collisions. Probably a site duplicate.")
print(e)
print()
# index_t.upsert_all(summaries, batch_size=batch_size, pk='uuid')
# TODO Some ebooks could be missed. We need to compute the batch list, insert new ebooks and update the site index
# print("Saved")
# print()
summaries=[]
# print()
# print("saving summary")
# index_t.upsert_all(summaries, batch_size=1000, pk='uuid')
# index_t.insert_all(summaries, batch_size=1000, pk='uuid')
try:
index_t.insert_all(summaries, batch_size=batch_size)
except:
print("sqlite3.IntegrityError: UNIQUE constraint failed: summary.uuid")
# print("summary done")
# print()
print()
print("fts")
index_t.populate_fts(["title", "authors", "series", "identifiers", "language", "tags", "publisher", "formats", "year"])
print("fts done")
def search(query_str, dir=".", links_only=False):
path = Path(dir) / "index.db"
db_index = Database(path)
# table=db_index["summary"]
# rows=table.search(query_str)
# print(rows)
sites=set()
ebook_ids=[]
for ebook in db_index["summary"].search(query_str):
sites.add(ebook[-1])
ebook_ids.append((ebook[3], ebook[-1]))
# print (ebook)
# print("sites:", sites)
# print("ebooks:", ebook_ids)
site_dbs={}
for s in sites:
f_uuid=s+".db"
path = Path(dir) / f_uuid
site_dbs[s]=Database(path)
# print(site_dbs[s].tables)
for e in ebook_ids:
# ebook=site_dbs[e[1]]["ebooks"].get(e[0])
# print("ebook:", ebook)
db=site_dbs[e[1]]
# ebooks=db.conn.execute("select * from ebooks").fetchone()
ebook=db.conn.execute(f'select * from ebooks where uuid="{e[0]}"').fetchone()
url=json.loads(db['site'].get(1)['urls'])[0]
library=db['site'].get(1)['library']
formats=json.loads(ebook[14])
id_=str(ebook[0])
if not links_only:
print()
print("Title:", ebook[2])
print("Author:", ebook[3])
print("Serie:", ebook[4])
print("Formats:", formats)
for f in formats:
print(url+"get/"+f+"/"+id_+"/"+library)
# https://stackoverflow.com/questions/26692284/how-to-prevent-brokenpipeerror-when-doing-a-flush-in-python
def index_to_json(dir='.'):
path = Path(dir) / "index.db"
db = Database(path)
# sys.stdout.flush()
try:
for row in db["summary"].rows:
if row['title']:
row['title']=json.loads(row['title'])
if row['authors']:
row['authors']=json.loads(row['authors'])
if row['series']:
row['series']=json.loads(row['series'])
if row['links']:
row['links']=json.loads(row['links'])
if row['tags']:
row['tags']=json.loads(row['tags'])
if row['identifiers']:
row['identifiers']=json.loads(row['identifiers'])
if row['formats']:
row['formats']=json.loads(row['formats'])
json.dump(row, sys.stdout)
sys.stdout.flush()
# return
except BrokenPipeError:
devnull = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, sys.stdout.fileno())
sys.exit(1)

calishot/site_index.py
@@ -0,0 +1,206 @@
import requests
from pathlib import Path
from urllib.parse import *
import uuid
from sqlite_utils import Database
import datetime
import gevent
from gevent import monkey
from gevent import Timeout
from gevent.pool import Pool
monkey.patch_socket()
def init_sites_db(dir="."):
path = Path(dir) / "sites.db"
db = Database(path)
if not "sites" in db.table_names():
db["sites"].create({
"uuid": str,
"url": str,
"hostnames": str,
"ports": str,
"country": int,
"isp": str,
"status": str,
"last_online": str,
"last_check": str,
"error": int,
# "schema_version": 1
# # TODO: add the most common formats
}, pk="uuid")
# }, pk="uuid", not_null=True)
# if not "sites" in db.table_names():
# db["sites"].create({
# "uuid": str
# }, pk="uuid",)
db.table("sites", pk='uuid', batch_size=100, alter=True)
return db
def save_site(db: Database, site):
# # TODO: Check if the site is not alreday present
# def save_sites(db, sites):
# db["sites"].insert_all(sites, alter=True, batch_size=100)
if not 'uuid' in site:
site['uuid']=str(uuid.uuid4())
print(site)
db["sites"].upsert(site, pk='uuid')
def check_and_save_site(db, site):
res= check_calibre_site(site)
print(res)
save_site(db, res)
# import pysnooper
# @pysnooper.snoop()
def check_calibre_site(site):
ret={}
ret['uuid']=site["uuid"]
now=str(datetime.datetime.now())
ret['last_check']=now
api=site['url']+'/ajax/'
timeout=15
library=""
url=api+'search'+library+'?num=0'
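    # num=0 asks the /ajax/search endpoint for the total ebook count only: a cheap
    # request that tells us whether the Calibre server is reachable.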
print()
print("Getting ebooks count:", site['url'])
print(url)
try:
r=requests.get(url, verify=False, timeout=(timeout, 30))
r.raise_for_status()
except requests.exceptions.HTTPError as e:
        ret['error']=r.status_code
if (r.status_code == 401):
ret['status']="unauthorized"
else:
ret['status']="down"
return ret
except requests.RequestException as e:
print("Unable to open site:", url)
# print (getattr(e, 'message', repr(e)))
print (e)
ret['status']="down"
return ret
except Exception as e:
print ("Other issue:", e)
ret['status']='Unknown Error'
print (e)
return ret
    except BaseException as err:
        print("Wazza !!!!")
        ret['status']='Critical Error'
        print(err)
        return ret
try:
print("Total count=",r.json()["total_num"])
except:
pass
status=ret['status']='online'
if status=="online":
ret['last_online']=now
return ret
def get_site_uuid_from_url(db, url):
site=urlparse(url)
hostname=site.hostname
site=site._replace(path='')
url=urlunparse(site)
# print (url)
# print (hostname)
row=db.conn.execute(f"select * from sites where instr(hostnames, '{hostname}')").fetchone()
# print(row)
if row:
return row
def map_site_from_url(url):
ret={}
site=urlparse(url)
print(site)
site=site._replace(path='')
ret['url']=urlunparse(site)
ret['hostnames']=[site.hostname]
ret['ports']=[str(site.port)]
return ret
def import_urls_from_file(filepath, dir='.'):
#TODO skip malformed urls
#TODO use cache instead
db=init_sites_db(dir)
with open(filepath) as f:
for url in f.readlines():
url=url.rstrip()
# url='http://'+url
if get_site_uuid_from_url(db, url):
                print(f"'{url}' already present")
                continue
            print(f"'{url}' added")
save_site(db, map_site_from_url(url))
def get_libs_from_site(site):
server=site.rstrip('/')
api=server+'/ajax/'
timeout=30
print()
print("Server:", server)
url=api+'library-info'
print()
print("Getting libraries from", server)
# print(url)
try:
r=requests.get(url, verify=False, timeout=(timeout, 30))
r.raise_for_status()
except requests.RequestException as e:
print("Unable to open site:", url)
# return
except Exception as e:
print ("Other issue:", e)
return
# pass
libraries = r.json()["library_map"].keys()
print("Libraries:", ", ".join(libraries))
return libraries
def check_calibre_list(dir='.'):
db=init_sites_db(dir)
sites=[]
for row in db["sites"].rows:
print(f"Queueing:{row['url']}")
sites.append(row)
print(sites)
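    # Probe up to 100 sites concurrently (gevent pool + monkey-patched sockets)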
pool = Pool(100)
pool.map(lambda s: check_and_save_site (db, s), sites)
# Example of an FTS search: sqlite-utils index.db "select * from summary_fts where summary_fts match 'title:fre*'"

poetry.lock (generated, 1160 lines)

File diff suppressed because it is too large.

pyproject.toml
@@ -0,0 +1,29 @@
[tool.poetry]
name = "calishot"
version = "0.1.0"
description = ""
authors = ["Your Name <you@example.com>"]
[tool.poetry.dependencies]
python = "^3.8"
sqlite-utils = "^2.8"
bs4 = "^0.0.1"
gevent = "^20.5.0"
datasette-pretty-json = "^0.2"
datasette-json-html = "^0.6"
datasette-mask-columns = "^0.2"
requests = "^2.24.0"
humanize = "^2.5.0"
langid = "^1.1.6"
iso639 = "^0.1.4"
unidecode = "^1.1.1"
datasette = "^0.50.2"
sqlitedict = "^1.7.0"
fire = "^0.3.1"
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"