diff --git a/ADC_function.py b/ADC_function.py
index 2b48e0b..746384c 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,121 +1,121 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import requests
-from configparser import ConfigParser
-import os
-import re
-import time
-import sys
-from lxml import etree
-import sys
-import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-# sys.setdefaultencoding('utf-8')
-
-config_file='config.ini'
-config = ConfigParser()
-
-if os.path.exists(config_file):
-    try:
-        config.read(config_file, encoding='UTF-8')
-    except:
-        print('[-]Config.ini read failed! Please use the offical file!')
-else:
-    print('[+]config.ini: not found, creating...',end='')
-    with open("config.ini", "wt", encoding='UTF-8') as code:
-        print("[common]", file=code)
-        print("main_mode = 1", file=code)
-        print("failed_output_folder = failed", file=code)
-        print("success_output_folder = JAV_output", file=code)
-        print("", file=code)
-        print("[proxy]",file=code)
-        print("proxy=127.0.0.1:1081",file=code)
-        print("timeout=10", file=code)
-        print("retry=3", file=code)
-        print("", file=code)
-        print("[Name_Rule]", file=code)
-        print("location_rule=actor+'/'+number",file=code)
-        print("naming_rule=number+'-'+title",file=code)
-        print("", file=code)
-        print("[update]",file=code)
-        print("update_check=1",file=code)
-        print("", file=code)
-        print("[media]", file=code)
-        print("media_warehouse=emby", file=code)
-        print("#emby plex kodi", file=code)
-        print("", file=code)
-        print("[escape]", file=code)
-        print("literals=\\", file=code)
-        print("", file=code)
-        print("[movie_location]", file=code)
-        print("path=", file=code)
-        print("", file=code)
-    print('.',end='')
-    time.sleep(2)
-    print('.')
-    print('[+]config.ini: created!')
-    print('[+]Please restart the program!')
-    time.sleep(4)
-    os._exit(0)
-try:
-    config.read(config_file, encoding='UTF-8')
-except:
-    print('[-]Config.ini read failed! Please use the offical file!')
-
-def get_network_settings():
-    try:
-        proxy = config["proxy"]["proxy"]
-        timeout = int(config["proxy"]["timeout"])
-        retry_count = int(config["proxy"]["retry"])
-        assert timeout > 0
-        assert retry_count > 0
-    except:
-        raise ValueError("[-]Proxy config error! Please check the config.")
-    return proxy, timeout, retry_count
-
-def getDataState(json_data): # 元数据获取失败检测
-    if json_data['title'] == '' or json_data['title'] == 'None' or json_data['title'] == 'null':
-        return 0
-    else:
-        return 1
-
-def ReadMediaWarehouse():
-    return config['media']['media_warehouse']
-
-def UpdateCheckSwitch():
-    check=str(config['update']['update_check'])
-    if check == '1':
-        return '1'
-    elif check == '0':
-        return '0'
-    elif check == '':
-        return '0'
-
-def getXpathSingle(htmlcode,xpath):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result1 = str(html.xpath(xpath)).strip(" ['']")
-    return result1
-
-def get_html(url,cookies = None):#网页请求核心
-    proxy, timeout, retry_count = get_network_settings()
-    i = 0
-    while i < retry_count:
-        try:
-            if not proxy == '':
-                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
-            else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
-        except:
-            i += 1
-            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
-    print('[-]Connect Failed! Please check your Proxy or Network!')
-
-
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import requests
+from configparser import ConfigParser
+import os
+import re
+import time
+import sys
+from lxml import etree
+import io
+# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
+# sys.setdefaultencoding('utf-8')
+
+config_file = 'config.ini'
+config = ConfigParser()
+
+if os.path.exists(config_file):
+    try:
+        config.read(config_file, encoding='UTF-8')
+    except:
+        print('[-]Config.ini read failed! Please use the official file!')
+else:
+    print('[+]config.ini: not found, creating...', end='')
+    with open("config.ini", "wt", encoding='UTF-8') as code:
+        print("[common]", file=code)
+        print("main_mode = 1", file=code)
+        print("failed_output_folder = failed", file=code)
+        print("success_output_folder = JAV_output", file=code)
+        print("", file=code)
+        print("[proxy]", file=code)
+        print("proxy=127.0.0.1:1081", file=code)
+        print("timeout=10", file=code)
+        print("retry=3", file=code)
+        print("", file=code)
+        print("[Name_Rule]", file=code)
+        print("location_rule=actor+'/'+number", file=code)
+        print("naming_rule=number+'-'+title", file=code)
+        print("", file=code)
+        print("[update]", file=code)
+        print("update_check=1", file=code)
+        print("", file=code)
+        print("[media]", file=code)
+        print("media_warehouse=emby", file=code)
+        print("#emby plex kodi", file=code)
+        print("", file=code)
+        print("[escape]", file=code)
+        print("literals=\\", file=code)
+        print("", file=code)
+        print("[movie_location]", file=code)
+        print("path=", file=code)
+        print("", file=code)
+    print('.', end='')
+    time.sleep(2)
+    print('.')
+    print('[+]config.ini: created!')
+    print('[+]Please restart the program!')
+    time.sleep(4)
+    os._exit(0)
+try:
+    config.read(config_file, encoding='UTF-8')
+except:
+    print('[-]Config.ini read failed! Please use the official file!')
+
+def get_network_settings():
+    try:
+        proxy = config["proxy"]["proxy"]
+        timeout = int(config["proxy"]["timeout"])
+        retry_count = int(config["proxy"]["retry"])
+        assert timeout > 0
+        assert retry_count > 0
+    except:
+        raise ValueError("[-]Proxy config error! Please check the config.")
+    return proxy, timeout, retry_count
+
+def getDataState(json_data):  # detect a failed metadata fetch
+    if json_data['title'] == '' or json_data['title'] == 'None' or json_data['title'] == 'null':
+        return 0
+    else:
+        return 1
+
+def ReadMediaWarehouse():
+    return config['media']['media_warehouse']
+
+def UpdateCheckSwitch():
+    check = str(config['update']['update_check'])
+    if check == '1':
+        return '1'
+    # treat '0', empty and any unexpected value as "disabled"
+    return '0'
+
+def getXpathSingle(htmlcode, xpath):
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result1 = str(html.xpath(xpath)).strip(" ['']")
+    return result1
+
+def get_html(url, cookies=None):  # core web-request helper
+    proxy, timeout, retry_count = get_network_settings()
+    i = 0
+    while i < retry_count:
+        try:
+            if not proxy == '':
+                proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
+                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
+                getweb = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
+                getweb.encoding = 'utf-8'
+                return getweb.text
+            else:
+                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
+                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+                getweb.encoding = 'utf-8'
+                return getweb.text
+        except requests.exceptions.RequestException:
+            i += 1
+            print('[-]Connect retry ' + str(i) + '/' + str(retry_count))
+    print('[-]Connect Failed! Please check your Proxy or Network!')
+
+
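Note: the retry loop in get_html above re-issues the whole request on every failure and builds the proxies dict by hand. A minimal sketch of the same behaviour using requests' built-in adapter retries follows. It is not part of the patch; the name fetch_html is hypothetical, and it assumes an HTTP(S) proxy in "host:port" form with retries on connection-level errors only.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fetch_html(url, proxy='', timeout=10, retry_count=3, cookies=None):
    session = requests.Session()
    # Retry handles connect/read failures; backoff_factor inserts a growing
    # delay between attempts instead of retrying immediately.
    adapter = HTTPAdapter(max_retries=Retry(total=retry_count, backoff_factor=0.5))
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    if proxy:
        session.proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    r = session.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
    r.encoding = 'utf-8'
    return r.text

Unlike the loop above, an exhausted retry budget here raises requests.exceptions.RequestException, so callers can tell a dead proxy apart from an empty page.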
diff --git a/avsox.py b/avsox.py
index 67ee9bf..e54d8d1 100644
--- a/avsox.py
+++ b/avsox.py
@@ -1,115 +1,115 @@
-import re
-from lxml import etree
-import json
-from bs4 import BeautifulSoup
-from ADC_function import *
-# import sys
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
-    d = {}
-    for i in a:
-        l = i.img['src']
-        t = i.span.get_text()
-        p2 = {t: l}
-        d.update(p2)
-    return d
-def getTitle(a):
-    try:
-        html = etree.fromstring(a, etree.HTMLParser())
-        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
-        return result.replace('/', '')
-    except:
-        return ''
-def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
-    d = []
-    for i in a:
-        d.append(i.span.get_text())
-    return d
-def getStudio(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
-    return result1
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
-    return result1
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
-    return result1
-def getNum(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
-    return result1
-def getYear(release):
-    try:
-        result = str(re.search('\d{4}',release).group())
-        return result
-    except:
-        return release
-def getRelease(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
-    return result1
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
-    return result
-def getCover_small(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
-    return result
-def getTag(a): # 获取演员
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'genre'})
-    d = []
-    for i in a:
-        d.append(i.get_text())
-    return d
-
-def main(number):
-    a = get_html('https://avsox.host/cn/search/' + number)
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
-    if result1 == '' or result1 == 'null' or result1 == 'None':
-        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
-        print(a)
-        html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
-        if result1 == '' or result1 == 'null' or result1 == 'None':
-            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
-            print(a)
-            html = etree.fromstring(a, 
etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - web = get_html(result1) - soup = BeautifulSoup(web, 'lxml') - info = str(soup.find(attrs={'class': 'row movie'})) - dic = { - 'actor': getActor(web), - 'title': getTitle(web).strip(getNum(web)), - 'studio': getStudio(info), - 'outline': '',# - 'runtime': getRuntime(info), - 'director': '', # - 'release': getRelease(info), - 'number': getNum(info), - 'cover': getCover(web), - 'cover_small': getCover_small(a), - 'imagecut': 3, - 'tag': getTag(web), - 'label': getLabel(info), - 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(web), - 'website': result1, - 'source': 'avsox.py', - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find_all(attrs={'class': 'avatar-box'}) + d = {} + for i in a: + l = i.img['src'] + t = i.span.get_text() + p2 = {t: l} + d.update(p2) + return d +def getTitle(a): + try: + html = etree.fromstring(a, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0] + return result.replace('/', '') + except: + return '' +def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + soup = BeautifulSoup(a, 'lxml') + a = soup.find_all(attrs={'class': 'avatar-box'}) + d = [] + for i in a: + d.append(i.span.get_text()) + return d +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') + return result1 +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']") + return result1 +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']") + return result1 +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']") + return result1 +def getYear(release): + try: + result = str(re.search('\d{4}',release).group()) + return result + except: + return release +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']") + return result1 +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']") + return result +def getCover_small(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") + return result +def getTag(a): # 获取演员 + soup = BeautifulSoup(a, 'lxml') + a = soup.find_all(attrs={'class': 'genre'}) + d = [] + for i in a: + 
d.append(i.get_text()) + return d + +def main(number): + a = get_html('https://avsox.host/cn/search/' + number) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + if result1 == '' or result1 == 'null' or result1 == 'None': + a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_')) + print(a) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + if result1 == '' or result1 == 'null' or result1 == 'None': + a = get_html('https://avsox.host/cn/search/' + number.replace('_', '')) + print(a) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + web = get_html(result1) + soup = BeautifulSoup(web, 'lxml') + info = str(soup.find(attrs={'class': 'row movie'})) + dic = { + 'actor': getActor(web), + 'title': getTitle(web).strip(getNum(web)), + 'studio': getStudio(info), + 'outline': '',# + 'runtime': getRuntime(info), + 'director': '', # + 'release': getRelease(info), + 'number': getNum(info), + 'cover': getCover(web), + 'cover_small': getCover_small(a), + 'imagecut': 3, + 'tag': getTag(web), + 'label': getLabel(info), + 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': getActorPhoto(web), + 'website': result1, + 'source': 'avsox.py', + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + #print(main('012717_472')) \ No newline at end of file diff --git a/core.py b/core.py index 3ad271b..8860911 100755 --- a/core.py +++ b/core.py @@ -1,691 +1,691 @@ -# -*- coding: utf-8 -*- - -import re -import os -import os.path -import shutil -from PIL import Image -import time -import json -from ADC_function import * -from configparser import ConfigParser -import argparse -# =========website======== -import fc2fans_club -import mgstage -import avsox -import javbus -import javdb -import fanza -import requests -import random - - -# =====================本地文件处理=========================== - -def escapePath(path, Config): # Remove escape literals - escapeLiterals = Config['escape']['literals'] - backslash = '\\' - for literal in escapeLiterals: - path = path.replace(backslash + literal, '') - return path - - -def moveFailedFolder(filepath, failed_folder): - print('[-]Move to Failed output folder') - shutil.move(filepath, str(os.getcwd()) + '/' + failed_folder + '/') - return - - -def CreatFailedFolder(failed_folder): - if not os.path.exists(failed_folder + '/'): # 新建failed文件夹 - try: - os.makedirs(failed_folder + '/') - except: - print("[-]failed!can not be make Failed output folder\n[-](Please run as Administrator)") - return - - -def getDataFromJSON(file_number, filepath, failed_folder): # 从JSON返回元数据 - """ - iterate through all services and fetch the data - """ - - func_mapping = { - "avsox": avsox.main, - "fc2": fc2fans_club.main, - "fanza": fanza.main, - "javdb": javdb.main, - "javbus": javbus.main, - "mgstage": mgstage.main, - } - - # default fetch order list, from the begining to the end - sources = ["javbus", "javdb", "fanza", "mgstage", "fc2", "avsox"] - - # if the input file name matches centain rules, - # move some web service to the begining of the list - if re.match(r"^\d{5,}", file_number) or ( - "HEYZO" in file_number or "heyzo" in file_number or "Heyzo" 
in file_number - ): - sources.insert(0, sources.pop(sources.index("avsox"))) - elif re.match(r"\d+\D+", file_number) or ( - "siro" in file_number or "SIRO" in file_number or "Siro" in file_number - ): - sources.insert(0, sources.pop(sources.index("fanza"))) - elif "fc2" in file_number or "FC2" in file_number: - sources.insert(0, sources.pop(sources.index("fc2"))) - - for source in sources: - json_data = json.loads(func_mapping[source](file_number)) - # if any service return a valid return, break - if getDataState(json_data) != 0: - break - - # ================================================网站规则添加结束================================================ - - title = json_data['title'] - actor_list = str(json_data['actor']).strip("[ ]").replace("'", '').split(',') # 字符串转列表 - release = json_data['release'] - number = json_data['number'] - studio = json_data['studio'] - source = json_data['source'] - runtime = json_data['runtime'] - outline = json_data['runtime'] - label = json_data['label'] - year = json_data['year'] - try: - cover_small = json_data['cover_small'] - except: - cover_small = '' - imagecut = json_data['imagecut'] - tag = str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ - actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') - - - if title == '' or number == '': - print('[-]Movie Data not found!') - moveFailedFolder(filepath, failed_folder) - return - - # if imagecut == '3': - # DownloadFileWithFilename() - - # ====================处理异常字符====================== #\/:*?"<>| - title = title.replace('\\', '') - title = title.replace('/', '') - title = title.replace(':', '') - title = title.replace('*', '') - title = title.replace('?', '') - title = title.replace('"', '') - title = title.replace('<', '') - title = title.replace('>', '') - title = title.replace('|', '') - release = release.replace('/', '-') - tmpArr = cover_small.split(',') - if len(tmpArr) > 0: - cover_small = tmpArr[0].strip('\"').strip('\'') - # ====================处理异常字符 END================== #\/:*?"<>| - - naming_rule = eval(config['Name_Rule']['naming_rule']) - location_rule = eval(config['Name_Rule']['location_rule']) - if 'actor' in config['Name_Rule']['location_rule'] and len(actor) > 100: - print(config['Name_Rule']['location_rule']) - location_rule = eval(config['Name_Rule']['location_rule'].replace("actor","'多人作品'")) - if 'title' in config['Name_Rule']['location_rule'] and len(title) > 100: - location_rule = eval(config['Name_Rule']['location_rule'].replace("title",'number')) - - # 返回处理后的json_data - json_data['title'] = title - json_data['actor'] = actor - json_data['release'] = release - json_data['cover_small'] = cover_small - json_data['tag'] = tag - json_data['naming_rule'] = naming_rule - json_data['location_rule'] = location_rule - json_data['year'] = year - json_data['actor_list'] = actor_list - return json_data - - -def get_info(json_data): # 返回json里的数据 - title = json_data['title'] - studio = json_data['studio'] - year = json_data['year'] - outline = json_data['outline'] - runtime = json_data['runtime'] - director = json_data['director'] - actor_photo = json_data['actor_photo'] - release = json_data['release'] - number = json_data['number'] - cover = json_data['cover'] - website = json_data['website'] - return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website - - -def smallCoverCheck(path, number, imagecut, cover_small, c_word, option, Config, filepath, failed_folder): - if imagecut == 3: - if option == 
'emby': - DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder) - try: - img = Image.open(path + '/1.jpg') - except Exception: - img = Image.open('1.jpg') - w = img.width - h = img.height - img.save(path + '/' + number + c_word + '.png') - time.sleep(1) - os.remove(path + '/1.jpg') - if option == 'kodi': - DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder) - try: - img = Image.open(path + '/1.jpg') - except Exception: - img = Image.open('1.jpg') - w = img.width - h = img.height - img.save(path + '/' + number + c_word + '-poster.jpg') - time.sleep(1) - os.remove(path + '/1.jpg') - if option == 'plex': - DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder) - try: - img = Image.open(path + '/1.jpg') - except Exception: - img = Image.open('1.jpg') - w = img.width - h = img.height - img.save(path + '/poster.jpg') - os.remove(path + '/1.jpg') - - -def creatFolder(success_folder, location_rule, json_data, Config): # 创建文件夹 - title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website= get_info(json_data) - if len(location_rule) > 240: # 新建成功输出文件夹 - path = success_folder + '/' + location_rule.replace("'actor'", "'manypeople'", 3).replace("actor", - "'manypeople'", - 3) # path为影片+元数据所在目录 - else: - path = success_folder + '/' + location_rule - # print(path) - if not os.path.exists(path): - path = escapePath(path, Config) - try: - os.makedirs(path) - except: - path = success_folder + '/' + location_rule.replace('/[' + number + ']-' + title, "/number") - path = escapePath(path, Config) - - os.makedirs(path) - return path - - -# =====================资源下载部分=========================== -def DownloadFileWithFilename(url, filename, path, Config, filepath, failed_folder): # path = examle:photo , video.in the Project Folder! - proxy, timeout, retry_count = get_network_settings() - i = 0 - - while i < retry_count: - try: - if not proxy == '': - if not os.path.exists(path): - os.makedirs(path) - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} - r = requests.get(url, headers=headers, timeout=timeout, - proxies={"http": "http://" + str(proxy), "https": "https://" + str(proxy)}) - if r == '': - print('[-]Movie Data not found!') - return - with open(str(path) + "/" + filename, "wb") as code: - code.write(r.content) - return - else: - if not os.path.exists(path): - os.makedirs(path) - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} - r = requests.get(url, timeout=timeout, headers=headers) - if r == '': - print('[-]Movie Data not found!') - return - with open(str(path) + "/" + filename, "wb") as code: - code.write(r.content) - return - except requests.exceptions.RequestException: - i += 1 - print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) - except requests.exceptions.ConnectionError: - i += 1 - print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) - except requests.exceptions.ProxyError: - i += 1 - print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) - except requests.exceptions.ConnectTimeout: - i += 1 - print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) - print('[-]Connect Failed! 
Please check your Proxy or Network!') - moveFailedFolder(filepath, failed_folder) - return - - -def imageDownload(option, cover, number, c_word, path, multi_part, Config, filepath, failed_folder): # 封面是否下载成功,否则移动到failed - if option == 'emby': - if DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder) == 'failed': - moveFailedFolder(filepath, failed_folder) - return - DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder) - if not os.path.getsize(path + '/' + number + c_word + '.jpg') == 0: - print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg') - return - i = 1 - while i <= int(config['proxy']['retry']): - if os.path.getsize(path + '/' + number + c_word + '.jpg') == 0: - print('[!]Image Download Failed! Trying again. [' + config['proxy']['retry'] + '/3]') - DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder) - i = i + 1 - continue - else: - break - if multi_part == 1: - old_name = os.path.join(path, number + c_word + '.jpg') - new_name = os.path.join(path, number + c_word + '.jpg') - os.rename(old_name, new_name) - print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg') - else: - print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg') - elif option == 'plex': - if DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder) == 'failed': - moveFailedFolder(filepath, failed_folder) - return - DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder) - if not os.path.getsize(path + '/fanart.jpg') == 0: - print('[+]Image Downloaded!', path + '/fanart.jpg') - return - i = 1 - while i <= int(config['proxy']['retry']): - if os.path.getsize(path + '/fanart.jpg') == 0: - print('[!]Image Download Failed! Trying again. [' + config['proxy']['retry'] + '/3]') - DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder) - i = i + 1 - continue - else: - break - if not os.path.getsize(path + '/' + number + c_word + '.jpg') == 0: - print('[!]Image Download Failed! Trying again.') - DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder) - print('[+]Image Downloaded!', path + '/fanart.jpg') - elif option == 'kodi': - if DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder) == 'failed': - moveFailedFolder(filepath, failed_folder) - return - DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder) - if not os.path.getsize(path + '/' + number + c_word + '-fanart.jpg') == 0: - print('[+]Image Downloaded!', path + '/' + number + c_word + '-fanart.jpg') - return - i = 1 - while i <= int(config['proxy']['retry']): - if os.path.getsize(path + '/' + number + c_word + '-fanart.jpg') == 0: - print('[!]Image Download Failed! Trying again. 
[' + config['proxy']['retry'] + '/3]') - DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder) - i = i + 1 - continue - else: - break - print('[+]Image Downloaded!', path + '/' + number + c_word + '-fanart.jpg') - - -def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list): - title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data) - try: - if not os.path.exists(path): - os.makedirs(path) - if option == 'plex': - with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: - print('', file=code) - print("", file=code) - print(" " + naming_rule + part + "", file=code) - print(" ", file=code) - print(" ", file=code) - print(" " + studio + "+", file=code) - print(" " + year + "", file=code) - print(" " + outline + "", file=code) - print(" " + outline + "", file=code) - print(" " + str(runtime).replace(" ", "") + "", file=code) - print(" " + director + "", file=code) - print(" poster.jpg", file=code) - print(" thumb.png", file=code) - print(" fanart.jpg", file=code) - try: - for key in actor_list: - print(" ", file=code) - print(" " + key + "", file=code) - print(" ", file=code) - except: - aaaa = '' - print(" " + studio + "", file=code) - print(" ", file=code) - if cn_sub == '1': - print(" 中文字幕", file=code) - try: - for i in str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(','): - print(" " + i + "", file=code) - except: - aaaaa = '' - try: - for i in str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(','): - print(" " + i + "", file=code) - except: - aaaaaaaa = '' - if cn_sub == '1': - print(" 中文字幕", file=code) - print(" " + number + "", file=code) - print(" " + release + "", file=code) - print(" " + cover + "", file=code) - print(" " + website + "", file=code) - print("", file=code) - print("[+]Writeed! " + path + "/" + number + ".nfo") - elif option == 'emby': - with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: - print('', file=code) - print("", file=code) - print(" " + naming_rule + part + "", file=code) - print(" ", file=code) - print(" ", file=code) - print(" " + studio + "+", file=code) - print(" " + year + "", file=code) - print(" " + outline + "", file=code) - print(" " + outline + "", file=code) - print(" " + str(runtime).replace(" ", "") + "", file=code) - print(" " + director + "", file=code) - print(" " + number + c_word + ".png", file=code) - print(" " + number + c_word + ".png", file=code) - print(" " + number + c_word + '.jpg' + "", file=code) - try: - for key in actor_list: - print(" ", file=code) - print(" " + key + "", file=code) - print(" ", file=code) - except: - aaaa = '' - print(" " + studio + "", file=code) - print(" ", file=code) - if cn_sub == '1': - print(" 中文字幕", file=code) - try: - for i in tag: - print(" " + i + "", file=code) - except: - aaaaa = '' - try: - for i in tag: - print(" " + i + "", file=code) - except: - aaaaaaaa = '' - if cn_sub == '1': - print(" 中文字幕", file=code) - print(" " + number + "", file=code) - print(" " + release + "", file=code) - print(" " + cover + "", file=code) - print(" " + website + "", file=code) - print("", file=code) - print("[+]Writeed! 
" + path + "/" + number + c_word + ".nfo") - elif option == 'kodi': - with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: - print('', file=code) - print("", file=code) - print(" " + naming_rule + part + "", file=code) - print(" ", file=code) - print(" ", file=code) - print(" " + studio + "+", file=code) - print(" " + year + "", file=code) - print(" " + outline + "", file=code) - print(" " + outline + "", file=code) - print(" " + str(runtime).replace(" ", "") + "", file=code) - print(" " + director + "", file=code) - print(" " + number + c_word + "-poster.jpg", file=code) - print(" " + number + c_word + '-fanart.jpg' + "", file=code) - try: - for key in actor_list: - print(" ", file=code) - print(" " + key + "", file=code) - print(" ", file=code) - except: - aaaa = '' - print(" " + studio + "", file=code) - print(" ", file=code) - if cn_sub == '1': - print(" 中文字幕", file=code) - try: - for i in tag: - print(" " + i + "", file=code) - except: - aaaaa = '' - try: - for i in tag: - print(" " + i + "", file=code) - except: - aaaaaaaa = '' - if cn_sub == '1': - print(" 中文字幕", file=code) - print(" " + number + "", file=code) - print(" " + release + "", file=code) - print(" " + cover + "", file=code) - print(" " + website + "", file=code) - print("", file=code) - print("[+]Writeed! " + path + "/" + number + c_word + ".nfo") - except IOError as e: - print("[-]Write Failed!") - print(e) - moveFailedFolder(filepath, failed_folder) - return - except Exception as e1: - print(e1) - print("[-]Write Failed!") - moveFailedFolder(filepath, failed_folder) - return - - -def cutImage(option, imagecut, path, number, c_word): - if option == 'plex': - if imagecut == 1: - try: - img = Image.open(path + '/fanart.jpg') - imgSize = img.size - w = img.width - h = img.height - img2 = img.crop((w / 1.9, 0, w, h)) - img2.save(path + '/poster.jpg') - except: - print('[-]Cover cut failed!') - elif imagecut == 0: - img = Image.open(path + '/fanart.jpg') - w = img.width - h = img.height - img.save(path + '/poster.jpg') - elif option == 'emby': - if imagecut == 1: - try: - img = Image.open(path + '/' + number + c_word + '.jpg') - imgSize = img.size - w = img.width - h = img.height - img2 = img.crop((w / 1.9, 0, w, h)) - img2.save(path + '/' + number + c_word + '.png') - except: - print('[-]Cover cut failed!') - elif imagecut == 0: - img = Image.open(path + '/' + number + c_word + '.jpg') - w = img.width - h = img.height - img.save(path + '/' + number + c_word + '.png') - elif option == 'kodi': - if imagecut == 1: - try: - img = Image.open(path + '/' + number + c_word + '-fanart.jpg') - imgSize = img.size - w = img.width - h = img.height - img2 = img.crop((w / 1.9, 0, w, h)) - img2.save(path + '/' + number + c_word + '-poster.jpg') - except: - print('[-]Cover cut failed!') - elif imagecut == 0: - img = Image.open(path + '/' + number + c_word + '-fanart.jpg') - w = img.width - h = img.height - try: - img = img.convert('RGB') - img.save(path + '/' + number + c_word + '-poster.jpg') - except: - img = img.convert('RGB') - img.save(path + '/' + number + c_word + '-poster.jpg') - - -def pasteFileToFolder(filepath, path, number, c_word): # 文件路径,番号,后缀,要移动至的位置 - houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|WEBM|avi|rmvb|wmv|mov|mp4|mkv|flv|ts|webm)$', filepath).group()) - try: - if config['common']['soft_link'] == '1': # 如果soft_link=1 使用软链接 - os.symlink(filepath, path + '/' + number + c_word + houzhui) - else: - os.rename(filepath, path + '/' + number + c_word + houzhui) - if 
os.path.exists(os.getcwd() + '/' + number + c_word + '.srt'): # 字幕移动 - os.rename(os.getcwd() + '/' + number + c_word + '.srt', path + '/' + number + c_word + '.srt') - print('[+]Sub moved!') - elif os.path.exists(os.getcwd() + '/' + number + c_word + '.ssa'): - os.rename(os.getcwd() + '/' + number + c_word + '.ssa', path + '/' + number + c_word + '.ssa') - print('[+]Sub moved!') - elif os.path.exists(os.getcwd() + '/' + number + c_word + '.sub'): - os.rename(os.getcwd() + '/' + number + c_word + '.sub', path + '/' + number + c_word + '.sub') - print('[+]Sub moved!') - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') - return - except PermissionError: - print('[-]Error! Please run as administrator!') - return - - -def pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word): # 文件路径,番号,后缀,要移动至的位置 - if multi_part == 1: - number += part # 这时number会被附加上CD1后缀 - houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|WEBM|avi|rmvb|wmv|mov|mp4|mkv|flv|ts|webm)$', filepath).group()) - try: - if config['common']['soft_link'] == '1': - os.symlink(filepath, path + '/' + number + part + c_word + houzhui) - else: - os.rename(filepath, path + '/' + number + part + c_word + houzhui) - if os.path.exists(number + '.srt'): # 字幕移动 - os.rename(number + part + c_word + '.srt', path + '/' + number + part + c_word + '.srt') - print('[+]Sub moved!') - elif os.path.exists(number + part + c_word + '.ass'): - os.rename(number + part + c_word + '.ass', path + '/' + number + part + c_word + '.ass') - print('[+]Sub moved!') - elif os.path.exists(number + part + c_word + '.sub'): - os.rename(number + part + c_word + '.sub', path + '/' + number + part + c_word + '.sub') - print('[+]Sub moved!') - print('[!]Success') - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') - return - except PermissionError: - print('[-]Error! 
Please run as administrator!') - return - - -def copyRenameJpgToBackdrop(option, path, number, c_word): - if option == 'plex': - shutil.copy(path + '/fanart.jpg', path + '/Backdrop.jpg') - shutil.copy(path + '/poster.jpg', path + '/thumb.png') - if option == 'emby': - shutil.copy(path + '/' + number + c_word + '.jpg', path + '/Backdrop.jpg') - if option == 'kodi': - shutil.copy(path + '/' + number + c_word + '-fanart.jpg', path + '/Backdrop.jpg') - - -def get_part(filepath, failed_folder): - try: - if re.search('-CD\d+', filepath): - return re.findall('-CD\d+', filepath)[0] - if re.search('-cd\d+', filepath): - return re.findall('-cd\d+', filepath)[0] - except: - print("[-]failed!Please rename the filename again!") - moveFailedFolder(filepath, failed_folder) - return - - -def debug_mode(json_data): - try: - if config['debug_mode']['switch'] == '1': - print('[+] ---Debug info---') - for i, v in json_data.items(): - if i == 'outline': - print('[+] -', i, ' :', len(v), 'characters') - continue - if i == 'actor_photo' or i == 'year': - continue - print('[+] -', "%-11s" % i, ':', v) - print('[+] ---Debug info---') - except: - aaa = '' - - -def core_main(file_path, number_th): - # =======================================================================初始化所需变量 - multi_part = 0 - part = '' - c_word = '' - option = '' - cn_sub = '' - config_file = 'config.ini' - Config = ConfigParser() - Config.read(config_file, encoding='UTF-8') - try: - option = ReadMediaWarehouse() - except: - print('[-]Config media_warehouse read failed!') - program_mode = Config['common']['main_mode'] # 运行模式 - failed_folder = Config['common']['failed_output_folder'] # 失败输出目录 - success_folder = Config['common']['success_output_folder'] # 成功输出目录 - filepath = file_path # 影片的路径 - number = number_th - json_data = getDataFromJSON(number, filepath, failed_folder) # 定义番号 - if json_data["number"] != number: - # fix issue #119 - # the root cause is we normalize the search id - # PrintFiles() will use the normalized id from website, - # but pasteFileToFolder() still use the input raw search id - # so the solution is: use the normalized search id - number = json_data["number"] - imagecut = json_data['imagecut'] - tag = json_data['tag'] - # =======================================================================判断-C,-CD后缀 - if '-CD' in filepath or '-cd' in filepath: - multi_part = 1 - part = get_part(filepath, failed_folder) - if '-c.' in filepath or '-C.' 
in filepath or '中文' in filepath or '字幕' in filepath: - cn_sub = '1' - c_word = '-C' # 中文字幕影片后缀 - - CreatFailedFolder(failed_folder) # 创建输出失败目录 - debug_mode(json_data) # 调试模式检测 - path = creatFolder(success_folder, json_data['location_rule'], json_data, Config) # 创建文件夹 - # =======================================================================刮削模式 - if program_mode == '1': - if multi_part == 1: - number += part # 这时number会被附加上CD1后缀 - smallCoverCheck(path, number, imagecut, json_data['cover_small'], c_word, option, Config, filepath, failed_folder) # 检查小封面 - imageDownload(option, json_data['cover'], number, c_word, path, multi_part, Config, filepath, failed_folder) # creatFoder会返回番号路径 - cutImage(option, imagecut, path, number, c_word) # 裁剪图 - copyRenameJpgToBackdrop(option, path, number, c_word) - PrintFiles(option, path, c_word, json_data['naming_rule'], part, cn_sub, json_data, filepath, failed_folder, tag, json_data['actor_list']) # 打印文件 - pasteFileToFolder(filepath, path, number, c_word) # 移动文件 - # =======================================================================整理模式 - elif program_mode == '2': - pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word) # 移动文件 +# -*- coding: utf-8 -*- + +import re +import os +import os.path +import shutil +from PIL import Image +import time +import json +from ADC_function import * +from configparser import ConfigParser +import argparse +# =========website======== +import fc2fans_club +import mgstage +import avsox +import javbus +import javdb +import fanza +import requests +import random + + +# =====================本地文件处理=========================== + +def escapePath(path, Config): # Remove escape literals + escapeLiterals = Config['escape']['literals'] + backslash = '\\' + for literal in escapeLiterals: + path = path.replace(backslash + literal, '') + return path + + +def moveFailedFolder(filepath, failed_folder): + print('[-]Move to Failed output folder') + shutil.move(filepath, str(os.getcwd()) + '/' + failed_folder + '/') + return + + +def CreatFailedFolder(failed_folder): + if not os.path.exists(failed_folder + '/'): # 新建failed文件夹 + try: + os.makedirs(failed_folder + '/') + except: + print("[-]failed!can not be make Failed output folder\n[-](Please run as Administrator)") + return + + +def getDataFromJSON(file_number, filepath, failed_folder): # 从JSON返回元数据 + """ + iterate through all services and fetch the data + """ + + func_mapping = { + "avsox": avsox.main, + "fc2": fc2fans_club.main, + "fanza": fanza.main, + "javdb": javdb.main, + "javbus": javbus.main, + "mgstage": mgstage.main, + } + + # default fetch order list, from the begining to the end + sources = ["javbus", "javdb", "fanza", "mgstage", "fc2", "avsox"] + + # if the input file name matches centain rules, + # move some web service to the begining of the list + if re.match(r"^\d{5,}", file_number) or ( + "HEYZO" in file_number or "heyzo" in file_number or "Heyzo" in file_number + ): + sources.insert(0, sources.pop(sources.index("avsox"))) + elif re.match(r"\d+\D+", file_number) or ( + "siro" in file_number or "SIRO" in file_number or "Siro" in file_number + ): + sources.insert(0, sources.pop(sources.index("fanza"))) + elif "fc2" in file_number or "FC2" in file_number: + sources.insert(0, sources.pop(sources.index("fc2"))) + + for source in sources: + json_data = json.loads(func_mapping[source](file_number)) + # if any service return a valid return, break + if getDataState(json_data) != 0: + break + + # 
================================================ end of per-site rules ================================================
+
+    title = json_data['title']
+    actor_list = str(json_data['actor']).strip("[ ]").replace("'", '').split(',')  # string to list
+    release = json_data['release']
+    number = json_data['number']
+    studio = json_data['studio']
+    source = json_data['source']
+    runtime = json_data['runtime']
+    outline = json_data['outline']
+    label = json_data['label']
+    year = json_data['year']
+    try:
+        cover_small = json_data['cover_small']
+    except KeyError:
+        cover_small = ''
+    imagecut = json_data['imagecut']
+    tag = str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # string to list
+    actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
+
+
+    if title == '' or number == '':
+        print('[-]Movie Data not found!')
+        moveFailedFolder(filepath, failed_folder)
+        return
+
+    # if imagecut == '3':
+    #     DownloadFileWithFilename()
+
+    # ==================== strip characters that are illegal in filenames ====================== #\/:*?"<>|
+    title = title.replace('\\', '')
+    title = title.replace('/', '')
+    title = title.replace(':', '')
+    title = title.replace('*', '')
+    title = title.replace('?', '')
+    title = title.replace('"', '')
+    title = title.replace('<', '')
+    title = title.replace('>', '')
+    title = title.replace('|', '')
+    release = release.replace('/', '-')
+    tmpArr = cover_small.split(',')
+    if len(tmpArr) > 0:
+        cover_small = tmpArr[0].strip('\"').strip('\'')
+    # ==================== strip illegal characters END ================== #\/:*?"<>|
+
+    naming_rule = eval(config['Name_Rule']['naming_rule'])
+    location_rule = eval(config['Name_Rule']['location_rule'])
+    if 'actor' in config['Name_Rule']['location_rule'] and len(actor) > 100:
+        print(config['Name_Rule']['location_rule'])
+        location_rule = eval(config['Name_Rule']['location_rule'].replace("actor", "'多人作品'"))
+    if 'title' in config['Name_Rule']['location_rule'] and len(title) > 100:
+        location_rule = eval(config['Name_Rule']['location_rule'].replace("title", 'number'))
+
+    # write the processed values back into json_data
+    json_data['title'] = title
+    json_data['actor'] = actor
+    json_data['release'] = release
+    json_data['cover_small'] = cover_small
+    json_data['tag'] = tag
+    json_data['naming_rule'] = naming_rule
+    json_data['location_rule'] = location_rule
+    json_data['year'] = year
+    json_data['actor_list'] = actor_list
+    return json_data
+
+
+def get_info(json_data):  # unpack the metadata fields from json_data
+    title = json_data['title']
+    studio = json_data['studio']
+    year = json_data['year']
+    outline = json_data['outline']
+    runtime = json_data['runtime']
+    director = json_data['director']
+    actor_photo = json_data['actor_photo']
+    release = json_data['release']
+    number = json_data['number']
+    cover = json_data['cover']
+    website = json_data['website']
+    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website
+
+
+def smallCoverCheck(path, number, imagecut, cover_small, c_word, option, Config, filepath, failed_folder):
+    if imagecut == 3:
+        if option == 'emby':
+            DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder)
+            try:
+                img = Image.open(path + '/1.jpg')
+            except Exception:
+                img = Image.open('1.jpg')
+            w = img.width
+            h = img.height
+            img.save(path + '/' + number + c_word + '.png')
+            time.sleep(1)
+            os.remove(path + '/1.jpg')
+        if option == 'kodi':
+            DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder)
+            try:
+                img = Image.open(path + '/1.jpg')
+            except Exception:
+                img = Image.open('1.jpg')
+            w = img.width
+            h = img.height
img.save(path + '/' + number + c_word + '-poster.jpg') + time.sleep(1) + os.remove(path + '/1.jpg') + if option == 'plex': + DownloadFileWithFilename(cover_small, '1.jpg', path, Config, filepath, failed_folder) + try: + img = Image.open(path + '/1.jpg') + except Exception: + img = Image.open('1.jpg') + w = img.width + h = img.height + img.save(path + '/poster.jpg') + os.remove(path + '/1.jpg') + + +def creatFolder(success_folder, location_rule, json_data, Config): # 创建文件夹 + title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website= get_info(json_data) + if len(location_rule) > 240: # 新建成功输出文件夹 + path = success_folder + '/' + location_rule.replace("'actor'", "'manypeople'", 3).replace("actor", + "'manypeople'", + 3) # path为影片+元数据所在目录 + else: + path = success_folder + '/' + location_rule + # print(path) + if not os.path.exists(path): + path = escapePath(path, Config) + try: + os.makedirs(path) + except: + path = success_folder + '/' + location_rule.replace('/[' + number + ']-' + title, "/number") + path = escapePath(path, Config) + + os.makedirs(path) + return path + + +# =====================资源下载部分=========================== +def DownloadFileWithFilename(url, filename, path, Config, filepath, failed_folder): # path = examle:photo , video.in the Project Folder! + proxy, timeout, retry_count = get_network_settings() + i = 0 + + while i < retry_count: + try: + if not proxy == '': + if not os.path.exists(path): + os.makedirs(path) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} + r = requests.get(url, headers=headers, timeout=timeout, + proxies={"http": "http://" + str(proxy), "https": "https://" + str(proxy)}) + if r == '': + print('[-]Movie Data not found!') + return + with open(str(path) + "/" + filename, "wb") as code: + code.write(r.content) + return + else: + if not os.path.exists(path): + os.makedirs(path) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} + r = requests.get(url, timeout=timeout, headers=headers) + if r == '': + print('[-]Movie Data not found!') + return + with open(str(path) + "/" + filename, "wb") as code: + code.write(r.content) + return + except requests.exceptions.RequestException: + i += 1 + print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) + except requests.exceptions.ConnectionError: + i += 1 + print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) + except requests.exceptions.ProxyError: + i += 1 + print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) + except requests.exceptions.ConnectTimeout: + i += 1 + print('[-]Image Download : Connect retry ' + str(i) + '/' + str(retry_count)) + print('[-]Connect Failed! 
Please check your Proxy or Network!')
+    moveFailedFolder(filepath, failed_folder)
+    return
+
+
+def imageDownload(option, cover, number, c_word, path, multi_part, Config, filepath, failed_folder):  # check that the cover downloaded; otherwise move the movie to the failed folder
+    if option == 'emby':
+        if DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder) == 'failed':
+            moveFailedFolder(filepath, failed_folder)
+            return
+        DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder)
+        if not os.path.getsize(path + '/' + number + c_word + '.jpg') == 0:
+            print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg')
+            return
+        i = 1
+        while i <= int(config['proxy']['retry']):
+            if os.path.getsize(path + '/' + number + c_word + '.jpg') == 0:
+                print('[!]Image Download Failed! Trying again. [' + str(i) + '/' + config['proxy']['retry'] + ']')
+                DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder)
+                i = i + 1
+                continue
+            else:
+                break
+        if multi_part == 1:
+            # NOTE: old and new names are identical here, so this rename is a no-op
+            old_name = os.path.join(path, number + c_word + '.jpg')
+            new_name = os.path.join(path, number + c_word + '.jpg')
+            os.rename(old_name, new_name)
+            print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg')
+        else:
+            print('[+]Image Downloaded!', path + '/' + number + c_word + '.jpg')
+    elif option == 'plex':
+        if DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder) == 'failed':
+            moveFailedFolder(filepath, failed_folder)
+            return
+        DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder)
+        if not os.path.getsize(path + '/fanart.jpg') == 0:
+            print('[+]Image Downloaded!', path + '/fanart.jpg')
+            return
+        i = 1
+        while i <= int(config['proxy']['retry']):
+            if os.path.getsize(path + '/fanart.jpg') == 0:
+                print('[!]Image Download Failed! Trying again. [' + str(i) + '/' + config['proxy']['retry'] + ']')
+                DownloadFileWithFilename(cover, 'fanart.jpg', path, Config, filepath, failed_folder)
+                i = i + 1
+                continue
+            else:
+                break
+        if not os.path.getsize(path + '/' + number + c_word + '.jpg') == 0:
+            print('[!]Image Download Failed! Trying again.')
+            DownloadFileWithFilename(cover, number + c_word + '.jpg', path, Config, filepath, failed_folder)
+        print('[+]Image Downloaded!', path + '/fanart.jpg')
+    elif option == 'kodi':
+        if DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder) == 'failed':
+            moveFailedFolder(filepath, failed_folder)
+            return
+        DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder)
+        if not os.path.getsize(path + '/' + number + c_word + '-fanart.jpg') == 0:
+            print('[+]Image Downloaded!', path + '/' + number + c_word + '-fanart.jpg')
+            return
+        i = 1
+        while i <= int(config['proxy']['retry']):
+            if os.path.getsize(path + '/' + number + c_word + '-fanart.jpg') == 0:
+                print('[!]Image Download Failed! Trying again. 
[' + config['proxy']['retry'] + '/3]') + DownloadFileWithFilename(cover, number + c_word + '-fanart.jpg', path, Config, filepath, failed_folder) + i = i + 1 + continue + else: + break + print('[+]Image Downloaded!', path + '/' + number + c_word + '-fanart.jpg') + + +def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list): + title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data) + try: + if not os.path.exists(path): + os.makedirs(path) + if option == 'plex': + with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: + print('', file=code) + print("", file=code) + print(" " + naming_rule + part + "", file=code) + print(" ", file=code) + print(" ", file=code) + print(" " + studio + "+", file=code) + print(" " + year + "", file=code) + print(" " + outline + "", file=code) + print(" " + outline + "", file=code) + print(" " + str(runtime).replace(" ", "") + "", file=code) + print(" " + director + "", file=code) + print(" poster.jpg", file=code) + print(" thumb.png", file=code) + print(" fanart.jpg", file=code) + try: + for key in actor_list: + print(" ", file=code) + print(" " + key + "", file=code) + print(" ", file=code) + except: + aaaa = '' + print(" " + studio + "", file=code) + print(" ", file=code) + if cn_sub == '1': + print(" 中文字幕", file=code) + try: + for i in str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(','): + print(" " + i + "", file=code) + except: + aaaaa = '' + try: + for i in str(json_data['tag']).strip("[ ]").replace("'", '').replace(" ", '').split(','): + print(" " + i + "", file=code) + except: + aaaaaaaa = '' + if cn_sub == '1': + print(" 中文字幕", file=code) + print(" " + number + "", file=code) + print(" " + release + "", file=code) + print(" " + cover + "", file=code) + print(" " + website + "", file=code) + print("", file=code) + print("[+]Writeed! " + path + "/" + number + ".nfo") + elif option == 'emby': + with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: + print('', file=code) + print("", file=code) + print(" " + naming_rule + part + "", file=code) + print(" ", file=code) + print(" ", file=code) + print(" " + studio + "+", file=code) + print(" " + year + "", file=code) + print(" " + outline + "", file=code) + print(" " + outline + "", file=code) + print(" " + str(runtime).replace(" ", "") + "", file=code) + print(" " + director + "", file=code) + print(" " + number + c_word + ".png", file=code) + print(" " + number + c_word + ".png", file=code) + print(" " + number + c_word + '.jpg' + "", file=code) + try: + for key in actor_list: + print(" ", file=code) + print(" " + key + "", file=code) + print(" ", file=code) + except: + aaaa = '' + print(" " + studio + "", file=code) + print(" ", file=code) + if cn_sub == '1': + print(" 中文字幕", file=code) + try: + for i in tag: + print(" " + i + "", file=code) + except: + aaaaa = '' + try: + for i in tag: + print(" " + i + "", file=code) + except: + aaaaaaaa = '' + if cn_sub == '1': + print(" 中文字幕", file=code) + print(" " + number + "", file=code) + print(" " + release + "", file=code) + print(" " + cover + "", file=code) + print(" " + website + "", file=code) + print("", file=code) + print("[+]Writeed! 
" + path + "/" + number + c_word + ".nfo") + elif option == 'kodi': + with open(path + "/" + number + c_word + ".nfo", "wt", encoding='UTF-8') as code: + print('', file=code) + print("", file=code) + print(" " + naming_rule + part + "", file=code) + print(" ", file=code) + print(" ", file=code) + print(" " + studio + "+", file=code) + print(" " + year + "", file=code) + print(" " + outline + "", file=code) + print(" " + outline + "", file=code) + print(" " + str(runtime).replace(" ", "") + "", file=code) + print(" " + director + "", file=code) + print(" " + number + c_word + "-poster.jpg", file=code) + print(" " + number + c_word + '-fanart.jpg' + "", file=code) + try: + for key in actor_list: + print(" ", file=code) + print(" " + key + "", file=code) + print(" ", file=code) + except: + aaaa = '' + print(" " + studio + "", file=code) + print(" ", file=code) + if cn_sub == '1': + print(" 中文字幕", file=code) + try: + for i in tag: + print(" " + i + "", file=code) + except: + aaaaa = '' + try: + for i in tag: + print(" " + i + "", file=code) + except: + aaaaaaaa = '' + if cn_sub == '1': + print(" 中文字幕", file=code) + print(" " + number + "", file=code) + print(" " + release + "", file=code) + print(" " + cover + "", file=code) + print(" " + website + "", file=code) + print("", file=code) + print("[+]Writeed! " + path + "/" + number + c_word + ".nfo") + except IOError as e: + print("[-]Write Failed!") + print(e) + moveFailedFolder(filepath, failed_folder) + return + except Exception as e1: + print(e1) + print("[-]Write Failed!") + moveFailedFolder(filepath, failed_folder) + return + + +def cutImage(option, imagecut, path, number, c_word): + if option == 'plex': + if imagecut == 1: + try: + img = Image.open(path + '/fanart.jpg') + imgSize = img.size + w = img.width + h = img.height + img2 = img.crop((w / 1.9, 0, w, h)) + img2.save(path + '/poster.jpg') + except: + print('[-]Cover cut failed!') + elif imagecut == 0: + img = Image.open(path + '/fanart.jpg') + w = img.width + h = img.height + img.save(path + '/poster.jpg') + elif option == 'emby': + if imagecut == 1: + try: + img = Image.open(path + '/' + number + c_word + '.jpg') + imgSize = img.size + w = img.width + h = img.height + img2 = img.crop((w / 1.9, 0, w, h)) + img2.save(path + '/' + number + c_word + '.png') + except: + print('[-]Cover cut failed!') + elif imagecut == 0: + img = Image.open(path + '/' + number + c_word + '.jpg') + w = img.width + h = img.height + img.save(path + '/' + number + c_word + '.png') + elif option == 'kodi': + if imagecut == 1: + try: + img = Image.open(path + '/' + number + c_word + '-fanart.jpg') + imgSize = img.size + w = img.width + h = img.height + img2 = img.crop((w / 1.9, 0, w, h)) + img2.save(path + '/' + number + c_word + '-poster.jpg') + except: + print('[-]Cover cut failed!') + elif imagecut == 0: + img = Image.open(path + '/' + number + c_word + '-fanart.jpg') + w = img.width + h = img.height + try: + img = img.convert('RGB') + img.save(path + '/' + number + c_word + '-poster.jpg') + except: + img = img.convert('RGB') + img.save(path + '/' + number + c_word + '-poster.jpg') + + +def pasteFileToFolder(filepath, path, number, c_word): # 文件路径,番号,后缀,要移动至的位置 + houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|WEBM|avi|rmvb|wmv|mov|mp4|mkv|flv|ts|webm)$', filepath).group()) + try: + if config['common']['soft_link'] == '1': # 如果soft_link=1 使用软链接 + os.symlink(filepath, path + '/' + number + c_word + houzhui) + else: + os.rename(filepath, path + '/' + number + c_word + houzhui) + if 
+def pasteFileToFolder(filepath, path, number, c_word):  # file path, number, suffix, destination folder
+    houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|WEBM|avi|rmvb|wmv|mov|mp4|mkv|flv|ts|webm)$', filepath).group())
+    try:
+        if config['common']['soft_link'] == '1':  # soft_link=1: create a symlink instead of moving the file
+            os.symlink(filepath, path + '/' + number + c_word + houzhui)
+        else:
+            os.rename(filepath, path + '/' + number + c_word + houzhui)
+        if os.path.exists(os.getcwd() + '/' + number + c_word + '.srt'):  # move subtitles along with the movie
+            os.rename(os.getcwd() + '/' + number + c_word + '.srt', path + '/' + number + c_word + '.srt')
+            print('[+]Sub moved!')
+        elif os.path.exists(os.getcwd() + '/' + number + c_word + '.ssa'):
+            os.rename(os.getcwd() + '/' + number + c_word + '.ssa', path + '/' + number + c_word + '.ssa')
+            print('[+]Sub moved!')
+        elif os.path.exists(os.getcwd() + '/' + number + c_word + '.sub'):
+            os.rename(os.getcwd() + '/' + number + c_word + '.sub', path + '/' + number + c_word + '.sub')
+            print('[+]Sub moved!')
+    except FileExistsError:
+        print('[-]File exists! Please check your movie!')
+        print('[-]Move the file to the root folder of the program and try again.')
+        return
+    except PermissionError:
+        print('[-]Permission denied! Please run as administrator!')
+        return
+
+
+def pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word):  # file path, number, suffix, destination folder
+    if multi_part == 1:
+        number += part  # number now carries the CD1/CD2 suffix
+    houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|WEBM|avi|rmvb|wmv|mov|mp4|mkv|flv|ts|webm)$', filepath).group())
+    try:
+        if config['common']['soft_link'] == '1':
+            os.symlink(filepath, path + '/' + number + c_word + houzhui)
+        else:
+            os.rename(filepath, path + '/' + number + c_word + houzhui)
+        if os.path.exists(number + c_word + '.srt'):  # move subtitles along with the movie
+            os.rename(number + c_word + '.srt', path + '/' + number + c_word + '.srt')
+            print('[+]Sub moved!')
+        elif os.path.exists(number + c_word + '.ass'):
+            os.rename(number + c_word + '.ass', path + '/' + number + c_word + '.ass')
+            print('[+]Sub moved!')
+        elif os.path.exists(number + c_word + '.sub'):
+            os.rename(number + c_word + '.sub', path + '/' + number + c_word + '.sub')
+            print('[+]Sub moved!')
+        print('[!]Success')
+    except FileExistsError:
+        print('[-]File exists! Please check your movie!')
+        print('[-]Move the file to the root folder of the program and try again.')
+        return
+    except PermissionError:
+        print('[-]Permission denied! Please run as administrator!')
+        return
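# A minimal sketch of the soft_link switch used above: with soft_link=1 the
# original file stays in place and only a symlink appears in the destination
# folder; otherwise os.rename() moves the file itself. Paths are placeholders.
import os

src = '/downloads/ABC-123.mp4'
dst = '/JAV_output/Actor/ABC-123/ABC-123.mp4'
use_soft_link = True  # corresponds to soft_link = 1 under [common] in config.ini

if use_soft_link:
    os.symlink(src, dst)   # dst points at src; src is left untouched
else:
    os.rename(src, dst)    # src is moved (raises OSError across filesystems)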
+def copyRenameJpgToBackdrop(option, path, number, c_word):
+    if option == 'plex':
+        shutil.copy(path + '/fanart.jpg', path + '/Backdrop.jpg')
+        shutil.copy(path + '/poster.jpg', path + '/thumb.png')
+    if option == 'emby':
+        shutil.copy(path + '/' + number + c_word + '.jpg', path + '/Backdrop.jpg')
+    if option == 'kodi':
+        shutil.copy(path + '/' + number + c_word + '-fanart.jpg', path + '/Backdrop.jpg')
+
+
+def get_part(filepath, failed_folder):
+    try:
+        if re.search(r'-CD\d+', filepath):
+            return re.findall(r'-CD\d+', filepath)[0]
+        if re.search(r'-cd\d+', filepath):
+            return re.findall(r'-cd\d+', filepath)[0]
+    except:
+        print("[-]Failed! Please rename the file and try again!")
+        moveFailedFolder(filepath, failed_folder)
+        return
+
+
+def debug_mode(json_data):
+    try:
+        if config['debug_mode']['switch'] == '1':
+            print('[+] ---Debug info---')
+            for i, v in json_data.items():
+                if i == 'outline':
+                    print('[+] -', i, ' :', len(v), 'characters')
+                    continue
+                if i == 'actor_photo' or i == 'year':
+                    continue
+                print('[+] -', "%-11s" % i, ':', v)
+            print('[+] ---Debug info---')
+    except:
+        pass
+
+
+def core_main(file_path, number_th):
+    # ======================================================================= initialize variables
+    multi_part = 0
+    part = ''
+    c_word = ''
+    option = ''
+    cn_sub = ''
+    config_file = 'config.ini'
+    Config = ConfigParser()
+    Config.read(config_file, encoding='UTF-8')
+    try:
+        option = ReadMediaWarehouse()
+    except:
+        print('[-]Config media_warehouse read failed!')
+    program_mode = Config['common']['main_mode']  # run mode
+    failed_folder = Config['common']['failed_output_folder']  # output folder for failed files
+    success_folder = Config['common']['success_output_folder']  # output folder for scraped files
+    filepath = file_path  # path of the movie file
+    number = number_th
+    json_data = getDataFromJSON(number, filepath, failed_folder)  # fetch metadata JSON for the number
+    if json_data["number"] != number:
+        # fix issue #119
+        # the root cause is that we normalize the search id:
+        # PrintFiles() will use the normalized id from the website,
+        # but pasteFileToFolder() still uses the raw input search id,
+        # so the solution is to use the normalized search id everywhere
+        number = json_data["number"]
+    imagecut = json_data['imagecut']
+    tag = json_data['tag']
+    # ======================================================================= detect -C / -CD suffixes
+    if '-CD' in filepath or '-cd' in filepath:
+        multi_part = 1
+        part = get_part(filepath, failed_folder)
+    if '-c.' in filepath or '-C.' 
in filepath or '中文' in filepath or '字幕' in filepath: + cn_sub = '1' + c_word = '-C' # 中文字幕影片后缀 + + CreatFailedFolder(failed_folder) # 创建输出失败目录 + debug_mode(json_data) # 调试模式检测 + path = creatFolder(success_folder, json_data['location_rule'], json_data, Config) # 创建文件夹 + # =======================================================================刮削模式 + if program_mode == '1': + if multi_part == 1: + number += part # 这时number会被附加上CD1后缀 + smallCoverCheck(path, number, imagecut, json_data['cover_small'], c_word, option, Config, filepath, failed_folder) # 检查小封面 + imageDownload(option, json_data['cover'], number, c_word, path, multi_part, Config, filepath, failed_folder) # creatFoder会返回番号路径 + cutImage(option, imagecut, path, number, c_word) # 裁剪图 + copyRenameJpgToBackdrop(option, path, number, c_word) + PrintFiles(option, path, c_word, json_data['naming_rule'], part, cn_sub, json_data, filepath, failed_folder, tag, json_data['actor_list']) # 打印文件 + pasteFileToFolder(filepath, path, number, c_word) # 移动文件 + # =======================================================================整理模式 + elif program_mode == '2': + pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word) # 移动文件 diff --git a/fanza.py b/fanza.py index 87c8be0..72632dc 100644 --- a/fanza.py +++ b/fanza.py @@ -1,229 +1,229 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -import json -import re - -from lxml import etree - -from ADC_function import * - -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - - -def getTitle(text): - html = etree.fromstring(text, etree.HTMLParser()) - result = html.xpath('//*[@id="title"]/text()')[0] - return result - - -def getActor(text): - # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(text, etree.HTMLParser()) - result = ( - str( - html.xpath( - "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - ) - ) - .strip(" ['']") - .replace("', '", ",") - ) - return result - - -def getStudio(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'メーカー')]/following-sibling::td/text()" - )[0] - return result - - -def getRuntime(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0] - return re.search(r"\d+", str(result)).group() - - -def getLabel(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()" - )[0] - return result - - -def getNum(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'品番:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'品番:')]/following-sibling::td/text()" - )[0] - return result - - -def getYear(getRelease): - try: - result = str(re.search(r"\d{4}", getRelease).group()) - return result - except: - return getRelease - - -def getRelease(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - 
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()" - )[0].lstrip("\n") - except: - result = html.xpath( - "//td[contains(text(),'発売日:')]/following-sibling::td/text()" - )[0].lstrip("\n") - return result - - -def getTag(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()" - ) - except: - result = html.xpath( - "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" - ) - return result - - -def getCover(text, number): - html = etree.fromstring(text, etree.HTMLParser()) - cover_number = number - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # sometimes fanza modify _ to \u0005f for image id - if "_" in cover_number: - cover_number = cover_number.replace("_", r"\u005f") - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # (TODO) handle more edge case - # print(html) - # raise exception here, same behavior as before - # people's major requirement is fetching the picture - raise ValueError("can not find image") - return result - - -def getDirector(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'監督:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'監督:')]/following-sibling::td/text()" - )[0] - return result - - -def getOutline(text): - html = etree.fromstring(text, etree.HTMLParser()) - try: - result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace( - "\n", "" - ) - if result == "": - result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace( - "\n", "" - ) - except: - # (TODO) handle more edge case - # print(html) - return "" - return result - - -def main(number): - # fanza allow letter + number + underscore, normalize the input here - # @note: I only find the usage of underscore as h_test123456789 - fanza_search_number = number - # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix - if fanza_search_number.startswith("h-"): - fanza_search_number = fanza_search_number.replace("h-", "h_") - - fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() - - fanza_urls = [ - "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", - ] - chosen_url = "" - for url in fanza_urls: - chosen_url = url + fanza_search_number - htmlcode = get_html(chosen_url) - if "404 Not Found" not in htmlcode: - break - if "404 Not Found" in htmlcode: - return json.dumps({"title": "",}) - try: - # for some old page, the input number does not match the page - # for example, the url will be cid=test012 - # but the hinban on the page is test00012 - # so get the hinban first, and then pass it to following functions - fanza_hinban = getNum(htmlcode) - data = { - "title": getTitle(htmlcode).strip(getActor(htmlcode)), - "studio": getStudio(htmlcode), - "outline": getOutline(htmlcode), - "runtime": getRuntime(htmlcode), - "director": getDirector(htmlcode) if "anime" not in chosen_url else "", - "actor": getActor(htmlcode) if "anime" not in chosen_url else "", - "release": getRelease(htmlcode), - "number": fanza_hinban, - "cover": getCover(htmlcode, fanza_hinban), - "imagecut": 1, - "tag": getTag(htmlcode), - "label": 
getLabel(htmlcode), - "year": getYear( - getRelease(htmlcode) - ), # str(re.search('\d{4}',getRelease(a)).group()), - "actor_photo": "", - "website": chosen_url, - "source": "fanza.py", - } - except: - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) # .encode('UTF-8') - return js - - -if __name__ == "__main__": - # print(main("DV-1562")) - # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") - # print(main("ipx292")) - pass +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import json +import re + +from lxml import etree + +from ADC_function import * + +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + + +def getTitle(text): + html = etree.fromstring(text, etree.HTMLParser()) + result = html.xpath('//*[@id="title"]/text()')[0] + return result + + +def getActor(text): + # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(text, etree.HTMLParser()) + result = ( + str( + html.xpath( + "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" + ) + ) + .strip(" ['']") + .replace("', '", ",") + ) + return result + + +def getStudio(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'メーカー')]/following-sibling::td/text()" + )[0] + return result + + +def getRuntime(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0] + return re.search(r"\d+", str(result)).group() + + +def getLabel(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()" + )[0] + return result + + +def getNum(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'品番:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'品番:')]/following-sibling::td/text()" + )[0] + return result + + +def getYear(getRelease): + try: + result = str(re.search(r"\d{4}", getRelease).group()) + return result + except: + return getRelease + + +def getRelease(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()" + )[0].lstrip("\n") + except: + result = html.xpath( + "//td[contains(text(),'発売日:')]/following-sibling::td/text()" + )[0].lstrip("\n") + return result + + +def getTag(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()" + ) + except: + result = html.xpath( + "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" + ) + return result + + +def getCover(text, number): + html = etree.fromstring(text, etree.HTMLParser()) + cover_number = number + try: + result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # sometimes fanza modify _ to 
\u0005f for image id + if "_" in cover_number: + cover_number = cover_number.replace("_", r"\u005f") + try: + result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # (TODO) handle more edge case + # print(html) + # raise exception here, same behavior as before + # people's major requirement is fetching the picture + raise ValueError("can not find image") + return result + + +def getDirector(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'監督:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'監督:')]/following-sibling::td/text()" + )[0] + return result + + +def getOutline(text): + html = etree.fromstring(text, etree.HTMLParser()) + try: + result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace( + "\n", "" + ) + if result == "": + result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace( + "\n", "" + ) + except: + # (TODO) handle more edge case + # print(html) + return "" + return result + + +def main(number): + # fanza allow letter + number + underscore, normalize the input here + # @note: I only find the usage of underscore as h_test123456789 + fanza_search_number = number + # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix + if fanza_search_number.startswith("h-"): + fanza_search_number = fanza_search_number.replace("h-", "h_") + + fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() + + fanza_urls = [ + "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", + ] + chosen_url = "" + for url in fanza_urls: + chosen_url = url + fanza_search_number + htmlcode = get_html(chosen_url) + if "404 Not Found" not in htmlcode: + break + if "404 Not Found" in htmlcode: + return json.dumps({"title": "",}) + try: + # for some old page, the input number does not match the page + # for example, the url will be cid=test012 + # but the hinban on the page is test00012 + # so get the hinban first, and then pass it to following functions + fanza_hinban = getNum(htmlcode) + data = { + "title": getTitle(htmlcode).strip(getActor(htmlcode)), + "studio": getStudio(htmlcode), + "outline": getOutline(htmlcode), + "runtime": getRuntime(htmlcode), + "director": getDirector(htmlcode) if "anime" not in chosen_url else "", + "actor": getActor(htmlcode) if "anime" not in chosen_url else "", + "release": getRelease(htmlcode), + "number": fanza_hinban, + "cover": getCover(htmlcode, fanza_hinban), + "imagecut": 1, + "tag": getTag(htmlcode), + "label": getLabel(htmlcode), + "year": getYear( + getRelease(htmlcode) + ), # str(re.search('\d{4}',getRelease(a)).group()), + "actor_photo": "", + "website": chosen_url, + "source": "fanza.py", + } + except: + data = { + "title": "", + } + js = json.dumps( + data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") + ) # .encode('UTF-8') + return js + + +if __name__ == "__main__": + # print(main("DV-1562")) + # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") + # print(main("ipx292")) + pass diff --git a/fc2fans_club.py b/fc2fans_club.py index 3215e49..9dfeb24 100755 --- a/fc2fans_club.py +++ b/fc2fans_club.py @@ -1,162 +1,162 @@ -import re -from lxml import etree#need install -import json 
-import ADC_function -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(htmlcode): #获取厂商 - #print(htmlcode) - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']") - result2 = str(re.sub('\D{2}2-\d+','',result)).replace(' ','',1) - #print(result2) - return result2 -def getActor(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']") - return result - except: - return '' -def getStudio(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']") - return result -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - #print(result) - return result -def getRelease(htmlcode2): # - #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') - html=etree.fromstring(htmlcode2,etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") - return result -def getCover(htmlcode,number,htmlcode2): #获取厂商 # - #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']") - if result == '': - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']") - return 'https://fc2club.com' + result2 - return 'http:' + result -def getOutline(htmlcode2): #获取番号 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') - return result -def getTag(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()')) - return result.strip(" ['']").replace("'",'').replace(' ','') -def getYear(release): - try: - result = re.search('\d{4}',release).group() - return result - except: - return '' - -def getTitle_fc2com(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0] - return result -def getActor_fc2com(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] - return result - except: - return '' -def getStudio_fc2com(htmlcode): #获取厂商 - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']") - return result - except: - return '' -def 
getNum_fc2com(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getRelease_fc2com(htmlcode2): # - html=etree.fromstring(htmlcode2,etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") - return result -def getCover_fc2com(htmlcode2): #获取厂商 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']") - return 'http:' + result -def getOutline_fc2com(htmlcode2): #获取番号 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') - return result -def getTag_fc2com(number): #获取番号 - htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/'+number+'/tag?'),'utf-8').decode('unicode-escape')) - result = re.findall('"tag":"(.*?)"', htmlcode) - return result -def getYear_fc2com(release): - try: - result = re.search('\d{4}',release).group() - return result - except: - return '' - -def main(number): - try: - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/'+number+'/') - htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html') - actor = getActor(htmlcode) - if getActor(htmlcode) == '': - actor = 'FC2系列' - dic = { - 'title': getTitle(htmlcode), - 'studio': getStudio(htmlcode), - 'year': '',#str(re.search('\d{4}',getRelease(number)).group()), - 'outline': '',#getOutline(htmlcode2), - 'runtime': getYear(getRelease(htmlcode)), - 'director': getStudio(htmlcode), - 'actor': actor, - 'release': getRelease(number), - 'number': 'FC2-'+number, - 'label': '', - 'cover': getCover(htmlcode,number,htmlcode2), - 'imagecut': 0, - 'tag': getTag(htmlcode), - 'actor_photo':'', - 'website': 'https://fc2club.com//html/FC2-' + number + '.html', - 'source':'https://fc2club.com//html/FC2-' + number + '.html', - } - if dic['title'] == '': - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/',cookies={'wei6H':'1'}) - actor = getActor(htmlcode) - if getActor(htmlcode) == '': - actor = 'FC2系列' - dic = { - 'title': getTitle_fc2com(htmlcode2), - 'studio': getStudio_fc2com(htmlcode2), - 'year': '', # str(re.search('\d{4}',getRelease(number)).group()), - 'outline': getOutline_fc2com(htmlcode2), - 'runtime': getYear_fc2com(getRelease(htmlcode2)), - 'director': getStudio_fc2com(htmlcode2), - 'actor': actor, - 'release': getRelease_fc2com(number), - 'number': 'FC2-' + number, - 'cover': getCover_fc2com(htmlcode2), - 'imagecut': 0, - 'tag': getTag_fc2com(number), - 'label': '', - 'actor_photo': '', - 'website': 'http://adult.contents.fc2.com/article/' + number + '/', - 'source': 'http://adult.contents.fc2.com/article/' + number + '/', - } - except Exception as e: - # (TODO) better handle this - # print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') - return js - - -#print(main('1252953')) +import re +from lxml import etree#need install +import json +import ADC_function +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(htmlcode): #获取厂商 + #print(htmlcode) + 
html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']") + result2 = str(re.sub('\D{2}2-\d+','',result)).replace(' ','',1) + #print(result2) + return result2 +def getActor(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']") + return result + except: + return '' +def getStudio(htmlcode): #获取厂商 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']") + return result +def getNum(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + #print(result) + return result +def getRelease(htmlcode2): # + #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') + html=etree.fromstring(htmlcode2,etree.HTMLParser()) + result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") + return result +def getCover(htmlcode,number,htmlcode2): #获取厂商 # + #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']") + if result == '': + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']") + return 'https://fc2club.com' + result2 + return 'http:' + result +def getOutline(htmlcode2): #获取番号 # + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') + return result +def getTag(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()')) + return result.strip(" ['']").replace("'",'').replace(' ','') +def getYear(release): + try: + result = re.search('\d{4}',release).group() + return result + except: + return '' + +def getTitle_fc2com(htmlcode): #获取厂商 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0] + return result +def getActor_fc2com(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] + return result + except: + return '' +def getStudio_fc2com(htmlcode): #获取厂商 + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']") + return result + except: + return '' +def getNum_fc2com(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + return result +def 
getRelease_fc2com(htmlcode2): # + html=etree.fromstring(htmlcode2,etree.HTMLParser()) + result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") + return result +def getCover_fc2com(htmlcode2): #获取厂商 # + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']") + return 'http:' + result +def getOutline_fc2com(htmlcode2): #获取番号 # + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') + return result +def getTag_fc2com(number): #获取番号 + htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/'+number+'/tag?'),'utf-8').decode('unicode-escape')) + result = re.findall('"tag":"(.*?)"', htmlcode) + return result +def getYear_fc2com(release): + try: + result = re.search('\d{4}',release).group() + return result + except: + return '' + +def main(number): + try: + htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/'+number+'/') + htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html') + actor = getActor(htmlcode) + if getActor(htmlcode) == '': + actor = 'FC2系列' + dic = { + 'title': getTitle(htmlcode), + 'studio': getStudio(htmlcode), + 'year': '',#str(re.search('\d{4}',getRelease(number)).group()), + 'outline': '',#getOutline(htmlcode2), + 'runtime': getYear(getRelease(htmlcode)), + 'director': getStudio(htmlcode), + 'actor': actor, + 'release': getRelease(number), + 'number': 'FC2-'+number, + 'label': '', + 'cover': getCover(htmlcode,number,htmlcode2), + 'imagecut': 0, + 'tag': getTag(htmlcode), + 'actor_photo':'', + 'website': 'https://fc2club.com//html/FC2-' + number + '.html', + 'source':'https://fc2club.com//html/FC2-' + number + '.html', + } + if dic['title'] == '': + htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/',cookies={'wei6H':'1'}) + actor = getActor(htmlcode) + if getActor(htmlcode) == '': + actor = 'FC2系列' + dic = { + 'title': getTitle_fc2com(htmlcode2), + 'studio': getStudio_fc2com(htmlcode2), + 'year': '', # str(re.search('\d{4}',getRelease(number)).group()), + 'outline': getOutline_fc2com(htmlcode2), + 'runtime': getYear_fc2com(getRelease(htmlcode2)), + 'director': getStudio_fc2com(htmlcode2), + 'actor': actor, + 'release': getRelease_fc2com(number), + 'number': 'FC2-' + number, + 'cover': getCover_fc2com(htmlcode2), + 'imagecut': 0, + 'tag': getTag_fc2com(number), + 'label': '', + 'actor_photo': '', + 'website': 'http://adult.contents.fc2.com/article/' + number + '/', + 'source': 'http://adult.contents.fc2.com/article/' + number + '/', + } + except Exception as e: + # (TODO) better handle this + # print(e) + dic = {"title": ""} + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') + return js + + +#print(main('1252953')) diff --git a/javbus.py b/javbus.py index aa18d2a..83d61f8 100755 --- a/javbus.py +++ b/javbus.py @@ -1,138 +1,138 @@ -import re -from pyquery import PyQuery as pq#need install -from lxml import etree#need install -from bs4 import BeautifulSoup#need install -import json -from ADC_function import * - -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) - d={} - 
for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']") - p2={t:p} - d.update(p2) - return d -def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - title=str(doc('div.container h3').text()).replace(' ','-') - try: - title2 = re.sub('n\d+-','',title) - return title2 - except: - return title -def getStudio(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) - image = doc('a.bigImage') - return image.attr('href') -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getRuntime(htmlcode): #获取分钟 - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find(text=re.compile('分鐘')) - return a -def getActor(htmlcode): #获取女优 - b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) - return b -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - return result -def getOutline(htmlcode): #获取演员 - doc = pq(htmlcode) - result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text()) - return result -def getSerise(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - return result -def getTag(htmlcode): # 获取演员 - tag = [] - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - for i in a: - if 'onmouseout' in str(i): - continue - tag.append(i.get_text()) - return tag - - -def main(number): - try: - htmlcode = get_html('https://www.javbus.com/' + number) - try: - dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) - except: - dww_htmlcode = '' - dic = { - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), - 'studio': getStudio(htmlcode), - 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), - 'outline': getOutline(dww_htmlcode), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'imagecut': 1, - 'tag': getTag(htmlcode), - 'label': getSerise(htmlcode), - 'actor_photo': getActorPhoto(htmlcode), - 'website': 'https://www.javbus.com/' + number, - 'source' : 'javbus.py', - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - except: - return main_uncensored(number) - -def main_uncensored(number): - htmlcode = get_html('https://www.javbus.com/' + number) - dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) - if 
getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_')) - dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) - dic = { - 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), - 'studio': getStudio(htmlcode), - 'year': getYear(htmlcode), - 'outline': getOutline(dww_htmlcode), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'tag': getTag(htmlcode), - 'label': getSerise(htmlcode), - 'imagecut': 0, - 'actor_photo': '', - 'website': 'https://www.javbus.com/' + number, - 'source': 'javbus.py', - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - +import re +from pyquery import PyQuery as pq#need install +from lxml import etree#need install +from bs4 import BeautifulSoup#need install +import json +from ADC_function import * + +def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find_all(attrs={'class': 'star-name'}) + d={} + for i in a: + l=i.a['href'] + t=i.get_text() + html = etree.fromstring(get_html(l), etree.HTMLParser()) + p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']") + p2={t:p} + d.update(p2) + return d +def getTitle(htmlcode): #获取标题 + doc = pq(htmlcode) + title=str(doc('div.container h3').text()).replace(' ','-') + try: + title2 = re.sub('n\d+-','',title) + return title2 + except: + return title +def getStudio(htmlcode): #获取厂商 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") + return result +def getYear(htmlcode): #获取年份 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") + return result +def getCover(htmlcode): #获取封面链接 + doc = pq(htmlcode) + image = doc('a.bigImage') + return image.attr('href') +def getRelease(htmlcode): #获取出版日期 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") + return result +def getRuntime(htmlcode): #获取分钟 + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find(text=re.compile('分鐘')) + return a +def getActor(htmlcode): #获取女优 + b=[] + soup=BeautifulSoup(htmlcode,'lxml') + a=soup.find_all(attrs={'class':'star-name'}) + for i in a: + b.append(i.get_text()) + return b +def getNum(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + return result +def getDirector(htmlcode): #获取导演 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") + return result +def getOutline(htmlcode): #获取演员 + doc = pq(htmlcode) + result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text()) + return result +def getSerise(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") + return result +def getTag(htmlcode): # 获取演员 + tag = [] + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find_all(attrs={'class': 'genre'}) + for i in a: + if 'onmouseout' in str(i): + 
continue + tag.append(i.get_text()) + return tag + + +def main(number): + try: + htmlcode = get_html('https://www.javbus.com/' + number) + try: + dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) + except: + dww_htmlcode = '' + dic = { + 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), + 'studio': getStudio(htmlcode), + 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), + 'outline': getOutline(dww_htmlcode), + 'runtime': getRuntime(htmlcode), + 'director': getDirector(htmlcode), + 'actor': getActor(htmlcode), + 'release': getRelease(htmlcode), + 'number': getNum(htmlcode), + 'cover': getCover(htmlcode), + 'imagecut': 1, + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), + 'actor_photo': getActorPhoto(htmlcode), + 'website': 'https://www.javbus.com/' + number, + 'source' : 'javbus.py', + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + except: + return main_uncensored(number) + +def main_uncensored(number): + htmlcode = get_html('https://www.javbus.com/' + number) + dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) + if getTitle(htmlcode) == '': + htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_')) + dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) + dic = { + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), + 'studio': getStudio(htmlcode), + 'year': getYear(htmlcode), + 'outline': getOutline(dww_htmlcode), + 'runtime': getRuntime(htmlcode), + 'director': getDirector(htmlcode), + 'actor': getActor(htmlcode), + 'release': getRelease(htmlcode), + 'number': getNum(htmlcode), + 'cover': getCover(htmlcode), + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), + 'imagecut': 0, + 'actor_photo': '', + 'website': 'https://www.javbus.com/' + number, + 'source': 'javbus.py', + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + diff --git a/javdb.py b/javdb.py index 727c992..180602a 100755 --- a/javdb.py +++ b/javdb.py @@ -1,123 +1,123 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) - result = html.xpath("/html/body/section/div/h2/strong/text()")[0] - return result -def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') -def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img - a = actor.split(',') - d={} - for i in a: - p={i:''} - d.update(p) - return d -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']") - result2 = 
str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) - result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result2 + result1).strip('+') -def getYear(getRelease): - try: - result = str(re.search('\d{4}', getRelease).group()) - return result - except: - return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+') -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',') -def getCover_small(a, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath("//div[@class='column column-video-cover']/a/img/@src")).strip(" ['']") - return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") - return result -def main(number): - try: - number = number.upper() - query_result = get_html('https://javdb.com/search?q=' 
+ number + '&f=all') - html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - # javdb sometime returns multiple results, - # and the first elememt maybe not the one we are looking for - # iterate all candidates and find the match one - urls = html.xpath('//*[@id="videos"]/div/div/a/@href') - ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') - correct_url = urls[ids.index(number)] - detail_page = get_html('https://javdb.com' + correct_url) - dic = { - 'actor': getActor(detail_page), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), - 'number': getNum(detail_page), - 'cover': getCover(detail_page), - 'cover_small': getCover_small(query_result, index=ids.index(number)), - 'imagecut': 3, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(getActor(detail_page)), - 'website': 'https://javdb.com' + correct_url, - 'source': 'javdb.py', - } - except Exception as e: - # print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -# main('DV-1562') -# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") -#print(main('ipx-292')) +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(a): + html = etree.fromstring(a, etree.HTMLParser()) + result = html.xpath("/html/body/section/div/h2/strong/text()")[0] + return result +def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') +def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img + a = actor.split(',') + d={} + for i in a: + p={i:''} + d.update(p) + return d +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').rstrip('mi') +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = 
str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) + result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result2 + result1).strip('+') +def getYear(getRelease): + try: + result = str(re.search('\d{4}', getRelease).group()) + return result + except: + return getRelease +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+') +def getTag(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',') +def getCover_small(a, index=0): + # same issue mentioned below, + # javdb sometime returns multiple results + # DO NOT just get the firt one, get the one with correct index number + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + if not 'https' in result: + result = 'https:' + result + return result +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath("//div[@class='column column-video-cover']/a/img/@src")).strip(" ['']") + return result +def getDirector(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getOutline(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") + return result +def main(number): + try: + number = number.upper() + query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') + html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + # javdb sometime returns multiple results, + # and the first elememt maybe not the one we are looking for + # iterate all candidates and find the match one + urls = html.xpath('//*[@id="videos"]/div/div/a/@href') + ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') + correct_url = urls[ids.index(number)] + detail_page = get_html('https://javdb.com' + correct_url) + dic = { + 'actor': getActor(detail_page), + 'title': getTitle(detail_page), + 'studio': getStudio(detail_page), + 'outline': getOutline(detail_page), + 
'runtime': getRuntime(detail_page), + 'director': getDirector(detail_page), + 'release': getRelease(detail_page), + 'number': getNum(detail_page), + 'cover': getCover(detail_page), + 'cover_small': getCover_small(query_result, index=ids.index(number)), + 'imagecut': 3, + 'tag': getTag(detail_page), + 'label': getLabel(detail_page), + 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': getActorPhoto(getActor(detail_page)), + 'website': 'https://javdb.com' + correct_url, + 'source': 'javdb.py', + } + except Exception as e: + # print(e) + dic = {"title": ""} + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + +# main('DV-1562') +# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") +#print(main('ipx-292')) diff --git a/mgstage.py b/mgstage.py index 8880f96..76665ab 100755 --- a/mgstage.py +++ b/mgstage.py @@ -1,111 +1,111 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(a): - try: - html = etree.fromstring(a, etree.HTMLParser()) - result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']") - return result.replace('/', ',') - except: - return '' -def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',') -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1+result2).strip('+').replace("', '",'').replace('"','') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace("', '",'').replace('"','') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+') -def 
diff --git a/mgstage.py b/mgstage.py
index 8880f96..76665ab 100755
--- a/mgstage.py
+++ b/mgstage.py
@@ -1,111 +1,111 @@
-import re
-from lxml import etree
-import json
-from bs4 import BeautifulSoup
-from ADC_function import *
-# import sys
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-
-def getTitle(a):
-    try:
-        html = etree.fromstring(a, etree.HTMLParser())
-        result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
-        return result.replace('/', ',')
-    except:
-        return ''
-def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
-    result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
-def getStudio(a):
-    html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
-    result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    return str(result1+result2).strip('+').replace("', '",'').replace('"','')
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
-def getNum(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    return str(result1 + result2).strip('+')
-def getYear(getRelease):
-    try:
-        result = str(re.search('\d{4}',getRelease).group())
-        return result
-    except:
-        return getRelease
-def getRelease(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    return str(result1 + result2).strip('+')
-def getTag(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
-    # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
-    return result
-def getDirector(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
-        '\\n')
-    return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
-def getOutline(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('//p/text()')).strip(" ['']")
-    return result
-def main(number2):
-    number=number2.upper()
-    htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
-    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
-    print(b)
-    dic = {
-        'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''),
-        'studio': getStudio(a),
-        'outline': getOutline(b),
-        'runtime': getRuntime(a),
-        'director': getDirector(a),
-        'actor': getActor(a),
-        'release': getRelease(a),
-        'number': getNum(a),
-        'cover': getCover(htmlcode),
-        'imagecut': 0,
-        'tag': getTag(a),
-        'label':getLabel(a),
-        'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
-        'actor_photo': '',
-        'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
-        'source': 'mgstage.py',
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
-    return js
-    #print(htmlcode)
-
-print(main('SIRO-3607'))
+import re
+from lxml import etree
+import json
+from bs4 import BeautifulSoup
+from ADC_function import *
+# import sys
+# import io
+# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
+
+def getTitle(a):
+    try:
+        html = etree.fromstring(a, etree.HTMLParser())
+        result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
+        return result.replace('/', ',')
+    except:
+        return ''
+def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
+    html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
+    result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
+def getStudio(a):
+    html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
+    result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    return str(result1+result2).strip('+').replace("', '",'').replace('"','')
+def getRuntime(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    return str(result1 + result2).strip('+').rstrip('mi')
+def getLabel(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
+def getNum(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+')
+def getYear(getRelease):
+    try:
+        result = str(re.search('\d{4}',getRelease).group())
+        return result
+    except:
+        return getRelease
+def getRelease(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+')
+def getTag(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
+def getCover(htmlcode):
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
+    # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
+    return result
+def getDirector(a):
+    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
+def getOutline(htmlcode):
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = str(html.xpath('//p/text()')).strip(" ['']")
+    return result
+def main(number2):
+    number=number2.upper()
+    htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
+    soup = BeautifulSoup(htmlcode, 'lxml')
+    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
+    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
+    print(b)
+    dic = {
+        'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''),
+        'studio': getStudio(a),
+        'outline': getOutline(b),
+        'runtime': getRuntime(a),
+        'director': getDirector(a),
+        'actor': getActor(a),
+        'release': getRelease(a),
+        'number': getNum(a),
+        'cover': getCover(htmlcode),
+        'imagecut': 0,
+        'tag': getTag(a),
+        'label':getLabel(a),
+        'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
+        'actor_photo': '',
+        'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
+        'source': 'mgstage.py',
+    }
+    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
+    return js
+    #print(htmlcode)
+
+print(main('SIRO-3607'))
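Both versions of mgstage.py repeat the same scrape for every field: find the detail table's <th> by its label, pull the text of the adjacent <td> and its <a> children as result1/result2, then strip list punctuation. A sketch of how that repeated pattern could be factored into one helper (get_table_field is a hypothetical name, not part of this patch):

    from lxml import etree

    def get_table_field(html_text, label):
        # Hypothetical refactor of the result1/result2 boilerplate above:
        # locate the <th> whose text contains `label`, then join all text
        # found anywhere under the sibling <td>.
        html = etree.fromstring(html_text, etree.HTMLParser())
        parts = html.xpath('//th[contains(text(),"%s")]/../td//text()' % label)
        return ','.join(p.strip() for p in parts if p.strip())

For example, get_table_field(a, '品番:') should yield the product number that getNum() assembles by hand, and one helper would make label fixes (such as getDirector currently querying the シリーズ row) a one-line change.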