From fcf9e7a1ef1e9dfd83f5846fc7eb2b26ad1cb609 Mon Sep 17 00:00:00 2001 From: Ozzie Isaacs Date: Wed, 17 Mar 2021 19:06:51 +0100 Subject: [PATCH] Upload pdf fixes: Handle no title Handle no author Fix import of more than one language Add missing pdf upload publisher handling --- cps/comic.py | 6 +- cps/constants.py | 2 +- cps/editbooks.py | 11 ++-- cps/epub.py | 3 +- cps/fb2.py | 3 +- cps/isoLanguages.py | 14 +++-- cps/uploader.py | 132 ++++++++++++++++++++++++++++++++++++++------ requirements.txt | 2 +- setup.cfg | 14 ++--- 9 files changed, 148 insertions(+), 39 deletions(-) diff --git a/cps/comic.py b/cps/comic.py index c2b30197..c1f1fd63 100644 --- a/cps/comic.py +++ b/cps/comic.py @@ -154,7 +154,8 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r tags="", series=loadedMetadata.series or "", series_id=loadedMetadata.issue or "", - languages=loadedMetadata.language) + languages=loadedMetadata.language, + publisher="") return BookMeta( file_path=tmp_file_path, @@ -166,4 +167,5 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r tags="", series="", series_id="", - languages="") + languages="", + publisher="") diff --git a/cps/constants.py b/cps/constants.py index ac1157ef..2c00d434 100644 --- a/cps/constants.py +++ b/cps/constants.py @@ -130,7 +130,7 @@ def selected_roles(dictionary): # :rtype: BookMeta BookMeta = namedtuple('BookMeta', 'file_path, extension, title, author, cover, description, tags, series, ' - 'series_id, languages') + 'series_id, languages, publisher') STABLE_VERSION = {'version': '0.6.12 Beta'} diff --git a/cps/editbooks.py b/cps/editbooks.py index 42bda734..28cad5c5 100644 --- a/cps/editbooks.py +++ b/cps/editbooks.py @@ -444,10 +444,10 @@ def edit_book_languages(languages, book, upload=False): return modify_database_object(input_l, book.languages, db.Languages, calibre_db.session, 'languages') -def edit_book_publisher(to_save, book): +def edit_book_publisher(publishers, book): changed = False - if to_save["publisher"]: - publisher = to_save["publisher"].rstrip().strip() + if publishers: + publisher = publishers.rstrip().strip() if len(book.publishers) == 0 or (len(book.publishers) > 0 and publisher != book.publishers[0].name): changed |= modify_database_object([publisher], book.publishers, db.Publishers, calibre_db.session, 'publisher') @@ -740,7 +740,7 @@ def edit_book(book_id): book.pubdate = db.Books.DEFAULT_PUBDATE # handle book publisher - modif_date |= edit_book_publisher(to_save, book) + modif_date |= edit_book_publisher(to_save['publisher'], book) # handle book languages modif_date |= edit_book_languages(to_save['languages'], book) @@ -867,6 +867,9 @@ def create_book_on_upload(modif_date, meta): # handle tags modif_date |= edit_book_tags(meta.tags, db_book) + # handle publisher + modif_date |= edit_book_publisher(meta.publisher, db_book) + # handle series modif_date |= edit_book_series(meta.series, db_book) diff --git a/cps/epub.py b/cps/epub.py index 583e4eda..5833c2aa 100644 --- a/cps/epub.py +++ b/cps/epub.py @@ -142,4 +142,5 @@ def get_epub_info(tmp_file_path, original_file_name, original_file_extension): tags=epub_metadata['subject'].encode('utf-8').decode('utf-8'), series=epub_metadata['series'].encode('utf-8').decode('utf-8'), series_id=epub_metadata['series_id'].encode('utf-8').decode('utf-8'), - languages=epub_metadata['language']) + languages=epub_metadata['language'], + publisher="") diff --git a/cps/fb2.py b/cps/fb2.py index bdb3d1d5..59df19ba 100644 --- a/cps/fb2.py +++ b/cps/fb2.py @@ -77,4 +77,5 @@ def get_fb2_info(tmp_file_path, original_file_extension): tags="", series="", series_id="", - languages="") + languages="", + publisher="") diff --git a/cps/isoLanguages.py b/cps/isoLanguages.py index 896d4faf..08bdf956 100644 --- a/cps/isoLanguages.py +++ b/cps/isoLanguages.py @@ -57,27 +57,29 @@ def get_language_name(locale, lang_code): def get_language_codes(locale, language_names, remainder=None): language_names = set(x.strip().lower() for x in language_names if x) - languages = list() + lang = list() for k, v in get_language_names(locale).items(): v = v.lower() if v in language_names: - languages.append(k) + lang.append(k) language_names.remove(v) if remainder is not None: remainder.extend(language_names) - return languages + return lang + def get_valid_language_codes(locale, language_names, remainder=None): - languages = list() + lang = list() if "" in language_names: language_names.remove("") for k, __ in get_language_names(locale).items(): if k in language_names: - languages.append(k) + lang.append(k) language_names.remove(k) if remainder is not None and len(language_names): remainder.extend(language_names) - return languages + return lang + def get_lang3(lang): try: diff --git a/cps/uploader.py b/cps/uploader.py index 5fd445c6..b86ce4dd 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -44,12 +44,17 @@ except (ImportError, RuntimeError) as e: use_generic_pdf_cover = True try: - from PyPDF2 import PdfFileReader - from PyPDF2 import __version__ as PyPdfVersion + from PyPDF3 import PdfFileReader + from PyPDF3 import __version__ as PyPdfVersion use_pdf_meta = True -except ImportError as e: - log.debug('Cannot import PyPDF2, extracting pdf metadata will not work: %s', e) - use_pdf_meta = False +except ImportError as ex: + try: + from PyPDF2 import PdfFileReader + from PyPDF2 import __version__ as PyPdfVersion + use_pdf_meta = True + except ImportError as e: + log.debug('Cannot import PyPDF3/PyPDF2, extracting pdf metadata will not work: %s / %s', e) + use_pdf_meta = False try: from . import epub @@ -102,7 +107,98 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension): tags="", series="", series_id="", - languages="") + languages="", + publisher="") + + +def parse_xmp(pdf_file): + """ + Parse XMP Metadata and prepare for BookMeta object + """ + try: + xmp_info = pdf_file.getXmpMetadata() + except Exception as e: + log.debug('Can not read XMP metadata', e) + return None + + if xmp_info: + try: + xmp_author = xmp_info.dc_creator # list + except AttributeError: + xmp_author = [''] + + if xmp_info.dc_title: + xmp_title = xmp_info.dc_title['x-default'] + else: + xmp_title = '' + + if xmp_info.dc_description: + xmp_description = xmp_info.dc_description['x-default'] + else: + xmp_description = '' + + languages = [] + try: + for i in xmp_info.dc_language: + #calibre-web currently only takes one language. + languages.append(isoLanguages.get_lang3(i)) + except: + languages.append('') + + xmp_tags = ', '.join(xmp_info.dc_subject) + xmp_publisher = ', '.join(xmp_info.dc_publisher) + + return {'author': xmp_author, + 'title': xmp_title, + 'subject': xmp_description, + 'tags': xmp_tags, 'languages': languages, + 'publisher': xmp_publisher + } + + +def parse_xmp(pdf_file): + """ + Parse XMP Metadata and prepare for BookMeta object + """ + try: + xmp_info = pdf_file.getXmpMetadata() + except Exception as e: + log.debug('Can not read XMP metadata', e) + return None + + if xmp_info: + try: + xmp_author = xmp_info.dc_creator # list + except: + xmp_author = [''] + + if xmp_info.dc_title: + xmp_title = xmp_info.dc_title['x-default'] + else: + xmp_title = '' + + if xmp_info.dc_description: + xmp_description = xmp_info.dc_description['x-default'] + else: + xmp_description = '' + + languages = [] + try: + for i in xmp_info.dc_language: + languages.append(isoLanguages.get_lang3(i)) + except AttributeError: + languages= [""] + + xmp_tags = ', '.join(xmp_info.dc_subject) + xmp_publisher = ', '.join(xmp_info.dc_publisher) + + return {'author': xmp_author, + 'title': xmp_title, + 'subject': xmp_description, + 'tags': xmp_tags, + 'languages': languages, + 'publisher': xmp_publisher + } def parse_xmp(pdf_file): @@ -154,6 +250,8 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): if use_pdf_meta: with open(tmp_file_path, 'rb') as f: + languages = [""] + publisher = "" pdf_file = PdfFileReader(f) doc_info = pdf_file.getDocumentInfo() xmp_info = parse_xmp(pdf_file) @@ -166,20 +264,22 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): languages = xmp_info['languages'] publisher = xmp_info['publisher'] - elif doc_info: - author = ' & '.join(split_authors([doc_info.author])) - title = doc_info.title - subject = doc_info.subject - tags = doc_info['/Keywords'] - languages = "" - publisher = "" + if doc_info: + if author == '': + author = ' & '.join(split_authors([doc_info.author])) if doc_info.author else u'Unknown' + if title == '': + title = doc_info.title if doc_info.title else original_file_name + if subject == '': + subject = doc_info.subject + if tags == '' and '/Keywords' in doc_info: + tags = doc_info['/Keywords'] else: author= u'Unknown' title = original_file_name subject = "" tags = "" - languages = "" + languages = [""] publisher = "" return BookMeta( @@ -192,8 +292,8 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): tags=tags, series="", series_id="", - languages=', '.join(languages) - ) + languages=','.join(languages), + publisher=publisher) def pdf_preview(tmp_file_path, tmp_dir): diff --git a/requirements.txt b/requirements.txt index 94dc7f3a..04aaa000 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ singledispatch>=3.4.0.0,<3.5.0.0 backports_abc>=0.4 Flask>=1.0.2,<1.2.0 iso-639>=0.4.5,<0.5.0 -PyPDF2>=1.26.0,<1.27.0 +PyPDF3>=1.0.0,<1.0.4 pytz>=2016.10 requests>=2.11.1,<2.25.0 SQLAlchemy>=1.3.0,<1.4.0 diff --git a/setup.cfg b/setup.cfg index 88624195..89e0f598 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,7 @@ install_requires = backports_abc>=0.4 Flask>=1.0.2,<1.2.0 iso-639>=0.4.5,<0.5.0 - PyPDF2>=1.26.0,<1.27.0 + PyPDF3>=1.0.0,<1.0.4 pytz>=2016.10 requests>=2.11.1,<2.25.0 SQLAlchemy>=1.3.0,<1.4.0 @@ -52,9 +52,9 @@ install_requires = [options.extras_require] gdrive = - google-api-python-client>=1.7.11,<1.8.0 - gevent>=1.2.1,<20.6.0 - greenlet>=0.4.12,<0.4.17 + google-api-python-client>=1.7.11,<1.13.0 + gevent>20.6.0,<21.2.0 + greenlet>=0.4.17,<1.1.0 httplib2>=0.9.2,<0.18.0 oauth2client>=4.0.0,<4.1.4 uritemplate>=3.0.0,<3.1.0 @@ -68,16 +68,16 @@ goodreads = goodreads>=0.3.2,<0.4.0 python-Levenshtein>=0.12.0,<0.13.0 ldap = - python-ldap>=3.0.0,<3.3.0 + python-ldap>=3.0.0,<3.4.0 Flask-SimpleLDAP>=1.4.0,<1.5.0 oauth = Flask-Dance>=1.4.0,<3.1.0 SQLAlchemy-Utils>=0.33.5,<0.37.0 metadata = - lxml>=3.8.0,<4.6.0 + lxml>=3.8.0,<4.7.0 rarfile>=2.7 comics = - natsort>=2.2.0 + natsort>=2.2.0,<7.1.0 comicapi>= 2.1.3,<2.2.0 kobo = jsonschema>=3.2.0,<3.3.0