From 0989195008bb903a3eb2bf3b214cb445dc401634 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 6 Jun 2021 04:31:12 +0800 Subject: [PATCH 1/5] javbus:fix uri --- WebCrawler/javbus.py | 8 +++++--- core.py | 29 ++++++++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 0082521..aaa3dbd 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -47,9 +47,10 @@ def getYear(htmlcode): #获取年份 def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') - if not "javbus.com" in image.attr('href'): - return "https://www.javbus.com" + image.attr('href') - return image.attr('href') + uri = image.attr('href') + if uri[0] == '/': + return "https://www.javbus.com" + uri + return uri def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") @@ -193,3 +194,4 @@ def main(number): if __name__ == "__main__" : print(main('ipx-292')) + print(main('CEMD-011')) diff --git a/core.py b/core.py index 5b7f595..dc64cc2 100644 --- a/core.py +++ b/core.py @@ -4,6 +4,7 @@ import pathlib import re import shutil import platform +import errno from PIL import Image from io import BytesIO @@ -49,7 +50,7 @@ def moveFailedFolder(filepath, failed_folder): def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON返回元数据 """ - iterate through all services and fetch the data + iterate through all services and fetch the data """ func_mapping = { @@ -150,17 +151,17 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON cover_small = '' else: cover_small = json_data.get('cover_small') - + if json_data.get('trailer') == None: trailer = '' else: trailer = json_data.get('trailer') - + if json_data.get('extrafanart') == None: extrafanart = '' else: extrafanart = json_data.get('extrafanart') - + imagecut = json_data.get('imagecut') tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') @@ -226,7 +227,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON studio = re.sub('.*/妄想族','妄想族',studio) studio = studio.replace('/',' ') # === 替换Studio片假名 END - + location_rule = eval(conf.location_rule()) if 'actor' in conf.location_rule() and len(actor) > 100: @@ -277,7 +278,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON json_data['trailer'] = '' else: json_data['trailer'] = '' - + if conf.is_extrafanart(): if extrafanart: json_data['extrafanart'] = extrafanart @@ -285,7 +286,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON json_data['extrafanart'] = '' else: json_data['extrafanart'] = '' - + naming_rule="" for i in conf.naming_rule().split("+"): if i not in json_data: @@ -644,6 +645,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config except PermissionError: print('[-]Error! Please run as administrator!') return + except OSError as oserr: + print('[-]OS Error errno ' + oserr.errno) + return def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf): # 文件路径,番号,后缀,要移动至的位置 @@ -656,7 +660,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo os.symlink(filepath, path + '/' + number + part + leak_word + c_word + houzhui) else: os.rename(filepath, path + '/' + number + part + leak_word + c_word + houzhui) - + sub_res = conf.sub_rule() for subname in sub_res: if os.path.exists(filepath.replace(houzhui, subname)): # 字幕移动 @@ -671,6 +675,9 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo except PermissionError: print('[-]Error! Please run as administrator!') return + except OSError as oserr: + print('[-]OS Error errno ' + oserr.errno) + return def get_part(filepath, failed_folder): try: @@ -742,8 +749,8 @@ def core_main(file_path, number_th, conf: config.Config): uncensored = 1 else: uncensored = 0 - - + + if '流出' in filepath or 'uncensored' in filepath: liuchu = '流出' leak = 1 @@ -795,7 +802,7 @@ def core_main(file_path, number_th, conf: config.Config): # 移动文件 paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) - + poster_path = path + '/' + number + leak_word + c_word + '-poster.jpg' thumb_path = path + '/' + number + leak_word + c_word + '-thumb.jpg' if conf.is_watermark(): From 5abeb360afaecdea6b858682b264d17ba0f6d6d5 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 6 Jun 2021 04:46:59 +0800 Subject: [PATCH 2/5] if not start with http --- WebCrawler/javbus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index aaa3dbd..82b9a59 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -48,7 +48,7 @@ def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') uri = image.attr('href') - if uri[0] == '/': + if uri[0:4] != 'http': return "https://www.javbus.com" + uri return uri def getRelease(htmlcode): #获取出版日期 From e2cd1f09df480fbd8dbf85613569dd08a0b1e976 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 6 Jun 2021 04:57:24 +0800 Subject: [PATCH 3/5] check uri not start with / --- WebCrawler/javbus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 82b9a59..de9d264 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -49,7 +49,10 @@ def getCover(htmlcode): #获取封面链接 image = doc('a.bigImage') uri = image.attr('href') if uri[0:4] != 'http': - return "https://www.javbus.com" + uri + if uri[0] == '/': + return "https://www.javbus.com" + uri + else: + return "https://www.javbus.com/" + uri return uri def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) From ac22fcdf057d5a90c2a10d69371f4f7067c31807 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 6 Jun 2021 05:06:42 +0800 Subject: [PATCH 4/5] optimize code logic --- WebCrawler/javbus.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index de9d264..a8aa93c 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -48,12 +48,11 @@ def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') uri = image.attr('href') - if uri[0:4] != 'http': - if uri[0] == '/': - return "https://www.javbus.com" + uri - else: - return "https://www.javbus.com/" + uri - return uri + if uri[0:4] == 'http': + return uri + if uri[0] != '/': + uri = '/' + uri + return "https://www.javbus.com" + uri def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") From 9072f8b5ecf7fa0a8dbc6472fe9e11887beab09e Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 6 Jun 2021 05:14:13 +0800 Subject: [PATCH 5/5] use str startswith() --- WebCrawler/javbus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index a8aa93c..d89c9dc 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -48,7 +48,7 @@ def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') uri = image.attr('href') - if uri[0:4] == 'http': + if uri.startswith('http'): return uri if uri[0] != '/': uri = '/' + uri