From cb7727900c4e96b90480bc70ea71d5042401aac0 Mon Sep 17 00:00:00 2001 From: dickreckard Date: Mon, 7 Sep 2020 13:30:03 +0200 Subject: [PATCH 1/5] Update uploader.py default cover changed from none to pdf_preview when metadata parsing fails --- cps/uploader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/uploader.py b/cps/uploader.py index 2cb982b9..863d094d 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -106,7 +106,7 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension): extension=original_file_extension, title=original_file_name, author=_(u'Unknown'), - cover=None, + cover=pdf_preview(tmp_file_path, original_file_name), description="", tags="", series="", From 23fe79c6182b7f08eb8769ed7c8c1203c61ef63a Mon Sep 17 00:00:00 2001 From: dickreckard Date: Fri, 11 Sep 2020 03:42:19 +0200 Subject: [PATCH 2/5] Update uploader.py --- cps/uploader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/uploader.py b/cps/uploader.py index 863d094d..dd37bf62 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -135,7 +135,7 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): author=' & '.join(split_authors([author])), cover=pdf_preview(tmp_file_path, original_file_name), description=subject, - tags="", + tags=doc_info['/Keywords'], series="", series_id="", languages="") From 22466d6b98fabf0046245c68b7db2bcf8c45991f Mon Sep 17 00:00:00 2001 From: root Date: Fri, 11 Sep 2020 10:08:55 +0000 Subject: [PATCH 3/5] xmp data processing added to the uploader --- cps/uploader.py | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/cps/uploader.py b/cps/uploader.py index dd37bf62..3747b24f 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -119,10 +119,36 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): if use_pdf_meta: with open(tmp_file_path, 'rb') as f: doc_info = PdfFileReader(f).getDocumentInfo() - if doc_info: - author = doc_info.author if doc_info.author else u'Unknown' - title = doc_info.title if doc_info.title else original_file_name - subject = doc_info.subject + xmp_info = PdfFileReader(f).getXmpMetadata() + if xmp_info: + xmp_author = xmp_info.dc_creator + if xmp_info.dc_title: + xmp_title = xmp_info.dc_title['x-default'] + else: + xmp_title = '' + if xmp_info.dc_description: + xmp_description = xmp_info.dc_description['x-default'] + else: + xmp_description = '' + if xmp_info.dc_subject: + xmp_tags = ', '.join(xmp_info.dc_subject) + else: + xmp_tags = '' + if xmp_info.dc_language: + xmp_language = ', '.join(xmp_info.dc_language) + else: + xmp_language='' + if xmp_info.dc_publisher: + xmp_publisher = ', '.join(xmp_info.dc_publisher) + else: + xmp_publisher='' + if xmp_info or doc_info: + author = xmp_author or split_authors([doc_info.author]) or u'Unknown' + title = xmp_title or doc_info.title or original_file_name + subject = xmp_description or doc_info.subject + publisher = xmp_publisher + tags = xmp_tags or doc_info['/Keywords'] + language = xmp_language else: author = u'Unknown' title = original_file_name @@ -132,13 +158,13 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): file_path=tmp_file_path, extension=original_file_extension, title=title, - author=' & '.join(split_authors([author])), + author=' & '.join(author), cover=pdf_preview(tmp_file_path, original_file_name), description=subject, - tags=doc_info['/Keywords'], + tags=tags, series="", series_id="", - languages="") + languages=language) def pdf_preview(tmp_file_path, tmp_dir): From 65929c02bc2150ad932fc61366cf5ca4b771c208 Mon Sep 17 00:00:00 2001 From: dickreckard Date: Fri, 11 Sep 2020 10:49:45 +0000 Subject: [PATCH 4/5] isolanguage parsing of xmp data --- cps/uploader.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/cps/uploader.py b/cps/uploader.py index 3747b24f..3f2b986a 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -22,7 +22,7 @@ import hashlib from tempfile import gettempdir from flask_babel import gettext as _ -from . import logger, comic +from . import logger, comic, isoLanguages from .constants import BookMeta from .helper import split_authors @@ -118,8 +118,9 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): doc_info = None if use_pdf_meta: with open(tmp_file_path, 'rb') as f: - doc_info = PdfFileReader(f).getDocumentInfo() - xmp_info = PdfFileReader(f).getXmpMetadata() + pdf_file = PdfFileReader(f) + doc_info = pdf_file.getDocumentInfo() + xmp_info = pdf_file.getXmpMetadata() if xmp_info: xmp_author = xmp_info.dc_creator if xmp_info.dc_title: @@ -130,25 +131,26 @@ def pdf_meta(tmp_file_path, original_file_name, original_file_extension): xmp_description = xmp_info.dc_description['x-default'] else: xmp_description = '' - if xmp_info.dc_subject: - xmp_tags = ', '.join(xmp_info.dc_subject) - else: - xmp_tags = '' - if xmp_info.dc_language: - xmp_language = ', '.join(xmp_info.dc_language) - else: - xmp_language='' - if xmp_info.dc_publisher: - xmp_publisher = ', '.join(xmp_info.dc_publisher) - else: - xmp_publisher='' + xmp_tags = ', '.join(xmp_info.dc_subject) + xmp_language = xmp_info.dc_language[0] + xmp_publisher = ', '.join(xmp_info.dc_publisher) + if xmp_info or doc_info: author = xmp_author or split_authors([doc_info.author]) or u'Unknown' title = xmp_title or doc_info.title or original_file_name subject = xmp_description or doc_info.subject publisher = xmp_publisher tags = xmp_tags or doc_info['/Keywords'] - language = xmp_language + if xmp_language : + lang = xmp_language.split('-', 1)[0].lower() + if len(lang) == 2: + language = isoLanguages.get(part1=lang).name + elif len(lang) == 3: + language = isoLanguages.get(part3=lang).name + else: + language = '' + else: + language = '' else: author = u'Unknown' title = original_file_name From 8abfaf0ffd28cbb4d4e33bcf854e7aa2efda86fb Mon Sep 17 00:00:00 2001 From: rra Date: Tue, 16 Mar 2021 17:53:33 +0100 Subject: [PATCH 5/5] Parse XMP metadata in separate function, add exception, try multiple metadata formats --- cps/uploader.py | 97 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 31 deletions(-) diff --git a/cps/uploader.py b/cps/uploader.py index 246b3bb9..a50e22ac 100644 --- a/cps/uploader.py +++ b/cps/uploader.py @@ -22,11 +22,10 @@ import hashlib from tempfile import gettempdir from flask_babel import gettext as _ -from . import logger, comic, isoLanguages +from . import logger, comic, isoLanguages, get_locale from .constants import BookMeta from .helper import split_authors - log = logger.create() @@ -82,7 +81,7 @@ def process(tmp_file_path, original_file_name, original_file_extension, rarExecu original_file_name, original_file_extension, rarExecutable) - except Exception as ex: + except Exception as ex: log.warning('cannot parse metadata, using default: %s', ex) if meta and meta.title.strip() and meta.author.strip(): @@ -106,59 +105,95 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension): languages="") -def pdf_meta(tmp_file_path, original_file_name, original_file_extension): - doc_info = None - if use_pdf_meta: - with open(tmp_file_path, 'rb') as f: - pdf_file = PdfFileReader(f) - doc_info = pdf_file.getDocumentInfo() - xmp_info = pdf_file.getXmpMetadata() +def parse_xmp(pdf_file): + """ + Parse XMP Metadata and prepare for BookMeta object + """ + try: + xmp_info = pdf_file.getXmpMetadata() + except Exception as e: + log.debug('Can not read XMP metadata', e) + return None + if xmp_info: - xmp_author = xmp_info.dc_creator + try: + xmp_author = xmp_info.dc_creator # list + except: + xmp_author = ['Unknown'] + if xmp_info.dc_title: xmp_title = xmp_info.dc_title['x-default'] else: xmp_title = '' + if xmp_info.dc_description: xmp_description = xmp_info.dc_description['x-default'] else: xmp_description = '' + + languages = [] + for i in xmp_info.dc_language: + #calibre-web currently only takes one language. + languages.append(isoLanguages.get_lang3(i)) + xmp_tags = ', '.join(xmp_info.dc_subject) - xmp_language = xmp_info.dc_language[0] xmp_publisher = ', '.join(xmp_info.dc_publisher) + xmp_languages = xmp_info.dc_language + + return {'author': xmp_author, + 'title': xmp_title, + 'subject': xmp_description, + 'tags': xmp_tags, 'languages': languages, + 'publisher': xmp_publisher + } + + +def pdf_meta(tmp_file_path, original_file_name, original_file_extension): + doc_info = None + xmp_info = None + + if use_pdf_meta: + with open(tmp_file_path, 'rb') as f: + pdf_file = PdfFileReader(f) + doc_info = pdf_file.getDocumentInfo() + xmp_info = parse_xmp(pdf_file) + + if xmp_info: + author = ' & '.join(split_authors(xmp_info['author'])) + title = xmp_info['title'] + subject = xmp_info['subject'] + tags = xmp_info['tags'] + languages = xmp_info['languages'] + publisher = xmp_info['publisher'] + + elif doc_info: + author = ' & '.join(split_authors([doc_info.author])) + title = doc_info.title + subject = doc_info.subject + tags = doc_info['/Keywords'] + languages = "" + publisher = "" - if xmp_info or doc_info: - author = xmp_author or split_authors([doc_info.author]) or u'Unknown' - title = xmp_title or doc_info.title or original_file_name - subject = xmp_description or doc_info.subject - publisher = xmp_publisher - tags = xmp_tags or doc_info['/Keywords'] - if xmp_language : - lang = xmp_language.split('-', 1)[0].lower() - if len(lang) == 2: - language = isoLanguages.get(part1=lang).name - elif len(lang) == 3: - language = isoLanguages.get(part3=lang).name - else: - language = '' - else: - language = '' else: - author = u'Unknown' + author= u'Unknown' title = original_file_name subject = "" + tags = "" + languages = "" + publisher = "" return BookMeta( file_path=tmp_file_path, extension=original_file_extension, title=title, - author=' & '.join(author), + author=author, cover=pdf_preview(tmp_file_path, original_file_name), description=subject, tags=tags, series="", series_id="", - languages=language) + languages=', '.join(languages) + ) def pdf_preview(tmp_file_path, tmp_dir):