Merge pull request #168 from jnozsc/normalize_EOL

normalize working tree line endings in Git
This commit is contained in:
Yoshiko
2020-03-27 01:58:18 +08:00
committed by GitHub
8 changed files with 1689 additions and 1689 deletions

View File

@@ -1,121 +1,121 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import requests import requests
from configparser import ConfigParser from configparser import ConfigParser
import os import os
import re import re
import time import time
import sys import sys
from lxml import etree from lxml import etree
import sys import sys
import io import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
# sys.setdefaultencoding('utf-8') # sys.setdefaultencoding('utf-8')
config_file = 'config.ini'
config = ConfigParser()

# Default configuration written on first run; sections mirror what the
# rest of the program reads ([common], [proxy], [Name_Rule], [update],
# [media], [escape], [movie_location]).
_DEFAULT_CONFIG = """[common]
main_mode = 1
failed_output_folder = failed
success_output_folder = JAV_output

[proxy]
proxy=127.0.0.1:1081
timeout=10
retry=3

[Name_Rule]
location_rule=actor+'/'+number
naming_rule=number+'-'+title

[update]
update_check=1

[media]
media_warehouse=emby
#emby plex kodi

[escape]
literals=\\

[movie_location]
path=

"""

if os.path.exists(config_file):
    try:
        config.read(config_file, encoding='UTF-8')
    except Exception:  # any parse/decoding error: tell the user, keep running
        print('[-]Config.ini read failed! Please use the offical file!')
else:
    # First run: write a default config.ini and ask the user to restart.
    print('[+]config.ini: not found, creating...', end='')
    with open("config.ini", "wt", encoding='UTF-8') as code:
        code.write(_DEFAULT_CONFIG)
    print('.', end='')
    time.sleep(2)
    print('.')
    print('[+]config.ini: created!')
    print('[+]Please restart the program!')
    time.sleep(4)
    os._exit(0)

# Re-read so `config` is populated for the import-time consumers below.
try:
    config.read(config_file, encoding='UTF-8')
except Exception:
    print('[-]Config.ini read failed! Please use the offical file!')
def get_network_settings():
    """Read the [proxy] section of config.ini.

    Returns:
        (proxy, timeout, retry_count) — proxy is a host:port string
        (may be '' to disable), timeout and retry_count are positive ints.

    Raises:
        ValueError: when the section is missing or values are invalid.
    """
    try:
        proxy = config["proxy"]["proxy"]
        timeout = int(config["proxy"]["timeout"])
        retry_count = int(config["proxy"]["retry"])
        # explicit check instead of `assert` (asserts vanish under -O)
        if timeout <= 0 or retry_count <= 0:
            raise ValueError
    except (KeyError, ValueError):
        raise ValueError("[-]Proxy config error! Please check the config.")
    return proxy, timeout, retry_count
def getDataState(json_data):  # detect whether metadata scraping failed
    """Return 0 when the scraped title is empty or a placeholder, else 1."""
    title = json_data['title']
    if title in ('', 'None', 'null'):
        return 0
    return 1
def ReadMediaWarehouse():
    """Return the configured media-server flavour (emby / plex / kodi)."""
    warehouse = config['media']['media_warehouse']
    return warehouse
def UpdateCheckSwitch():
    """Return '1' when update checking is enabled, '0' when disabled.

    NOTE(review): any other configured value falls through and yields
    None, matching the original behaviour.
    """
    check = str(config['update']['update_check'])
    if check == '1':
        return '1'
    if check in ('0', ''):
        return '0'
def getXpathSingle(htmlcode, xpath):
    """Evaluate *xpath* against *htmlcode* and return the hit as plain text."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath(xpath)).strip(" ['']")
def get_html(url, cookies=None):  # core web-request helper
    """GET *url* and return the response body decoded as UTF-8.

    Routes through the configured proxy when one is set and retries up to
    the configured count. Returns None after all retries fail (the printed
    diagnostics are the only failure signal, as before).
    """
    proxy, timeout, retry_count = get_network_settings()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/68.0.3440.106 Safari/537.36'}
    # Build the proxy map once instead of duplicating the request branch.
    proxies = None
    if proxy != '':
        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
    for attempt in range(1, retry_count + 1):
        try:
            getweb = requests.get(str(url), headers=headers, timeout=timeout,
                                  proxies=proxies, cookies=cookies)
            getweb.encoding = 'utf-8'
            return getweb.text
        except requests.RequestException:  # narrow: network/timeout errors only
            print('[-]Connect retry ' + str(attempt) + '/' + str(retry_count))
    print('[-]Connect Failed! Please check your Proxy or Network!')

228
avsox.py
View File

@@ -1,115 +1,115 @@
import re import re
from lxml import etree from lxml import etree
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    """Map each actor name to the URL of their avatar image."""
    soup = BeautifulSoup(htmlcode, 'lxml')
    photos = {}
    for box in soup.find_all(attrs={'class': 'avatar-box'}):
        photos[box.span.get_text()] = box.img['src']
    return photos
def getTitle(a):
    """Return the page title with '/' removed; '' on any parse failure."""
    try:
        tree = etree.fromstring(a, etree.HTMLParser())
        raw = str(tree.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return raw.replace('/', '')
    except:
        return ''
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    """Return the list of actor names shown on the page."""
    soup = BeautifulSoup(a, 'lxml')
    boxes = soup.find_all(attrs={'class': 'avatar-box'})
    return [box.span.get_text() for box in boxes]
def getStudio(a):
    """Return the studio (制作商) field as space-joined text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')
    return str(hits).strip(" ['']").replace("', '", ' ')
def getRuntime(a):
    """Return the runtime (长度) in minutes, as a string."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"长度:")]/../text()')
    return str(hits).strip(" ['分钟']")
def getLabel(a):
    """Return the series (系列) field."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')
    return str(hits).strip(" ['']")
def getNum(a):
    """Return the id (识别码) field."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')
    return str(hits).strip(" ['']")
def getYear(release):
    """Extract a 4-digit year from *release*.

    Falls back to returning *release* unchanged when no 4-digit run is
    found (or the input is not a string), matching the old behaviour.
    """
    try:
        # raw string for the regex; narrow except instead of a bare one
        return str(re.search(r'\d{4}', release).group())
    except (AttributeError, TypeError):
        return release
def getRelease(a):
    """Return the release date (发行时间) field."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"发行时间:")]/../text()')
    return str(hits).strip(" ['']")
def getCover(htmlcode):
    """Return the full-size cover image URL."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
def getCover_small(htmlcode):
    """Return the search-result thumbnail URL."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
def getTag(a):  # genre tags for the title
    """Return the list of genre tags on the page."""
    soup = BeautifulSoup(a, 'lxml')
    return [node.get_text() for node in soup.find_all(attrs={'class': 'genre'})]
def main(number):
    """Scrape avsox for *number* and return the metadata as a JSON string.

    avsox indexes some ids with '-' replaced by '_' or with '_' removed;
    each variant is tried in turn until the search page yields a link.
    """
    a = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 in ('', 'null', 'None'):
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 in ('', 'null', 'None'):
        a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',#
        'runtime': getRuntime(info),
        'director': '', #
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
#print(main('012717_472'))

1382
core.py

File diff suppressed because it is too large Load Diff

458
fanza.py
View File

@@ -1,229 +1,229 @@
#!/usr/bin/python3 #!/usr/bin/python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import re import re
from lxml import etree from lxml import etree
from ADC_function import * from ADC_function import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(text):
    """Return the raw page title."""
    tree = etree.fromstring(text, etree.HTMLParser())
    return tree.xpath('//*[@id="title"]/text()')[0]
def getActor(text):
    """Return the performers (出演者) as a comma-joined string."""
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    tree = etree.fromstring(text, etree.HTMLParser())
    names = tree.xpath(
        "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
    )
    return str(names).strip(" ['']").replace("', '", ",")
def getStudio(text):
    """Return the maker (メーカー) field; link text preferred, plain text fallback."""
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/a/text()")
    if hits:
        return hits[0]
    return tree.xpath("//td[contains(text(),'メーカー')]/following-sibling::td/text()")[0]
def getRuntime(text):
    """Return the runtime (収録時間) in minutes as a digit string."""
    tree = etree.fromstring(text, etree.HTMLParser())
    cell = tree.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    return re.search(r"\d+", str(cell)).group()
def getLabel(text):
    """Return the series (シリーズ) field; link text preferred, plain text fallback."""
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()")
    if hits:
        return hits[0]
    return tree.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/text()")[0]
def getNum(text):
    """Return the product id (品番); link text preferred, plain text fallback."""
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath("//td[contains(text(),'品番:')]/following-sibling::td/a/text()")
    if hits:
        return hits[0]
    return tree.xpath("//td[contains(text(),'品番:')]/following-sibling::td/text()")[0]
def getYear(getRelease):
    """Extract a 4-digit year from the release string.

    Falls back to returning the input unchanged when no year is found.
    NOTE(review): the parameter shadows the module-level getRelease()
    function; the name is kept for interface compatibility.
    """
    try:
        return str(re.search(r"\d{4}", getRelease).group())
    except (AttributeError, TypeError):  # narrow except instead of a bare one
        return getRelease
def getRelease(text):
    """Return the release date (発売日) with leading newlines stripped."""
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()")
    if hits:
        return hits[0].lstrip("\n")
    return tree.xpath("//td[contains(text(),'発売日:')]/following-sibling::td/text()")[0].lstrip("\n")
def getTag(text):
    """Return the genre list (ジャンル), possibly empty.

    The original `except` branch was unreachable: xpath() without
    indexing never raises, so the linked-genre query was always
    returned even when empty. Use an explicit empty-result fallback
    to the plain-text query instead.
    """
    tree = etree.fromstring(text, etree.HTMLParser())
    tags = tree.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()")
    if not tags:
        tags = tree.xpath("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()")
    return tags
def getCover(text, number):
    """Return the cover-image link whose element id matches *number*.

    Raises ValueError when no image is found. Fixes a latent
    UnboundLocalError in the original: when the first lookup failed and
    *number* contained no underscore, `result` was returned unbound.
    """
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath('//*[@id="' + number + '"]/@href')
    if hits:
        return hits[0]
    # sometimes fanza writes _ as \u005f in the image id
    if "_" in number:
        alt = number.replace("_", r"\u005f")
        hits = tree.xpath('//*[@id="' + alt + '"]/@href')
        if hits:
            return hits[0]
    # (TODO) handle more edge cases; raising keeps the old contract —
    # fetching the picture is the users' main requirement
    raise ValueError("can not find image")
def getDirector(text):
    """Return the director (監督); link text preferred, plain text fallback."""
    tree = etree.fromstring(text, etree.HTMLParser())
    hits = tree.xpath("//td[contains(text(),'監督:')]/following-sibling::td/a/text()")
    if hits:
        return hits[0]
    return tree.xpath("//td[contains(text(),'監督:')]/following-sibling::td/text()")[0]
def getOutline(text):
    """Return the description paragraph, '' when absent."""
    tree = etree.fromstring(text, etree.HTMLParser())
    try:
        summary = str(tree.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
        if summary == "":
            # some layouts wrap the description in <p> elements
            summary = str(tree.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
    except:
        # (TODO) handle more edge cases
        return ""
    return summary
def main(number):
    """Look *number* up on fanza/DMM and return its metadata as JSON.

    fanza cids allow letters + digits + underscore; the input is
    normalized here. @note: the only known underscore usage is
    h_test123456789 — AV_Data_Capture.py.getNumber() over-formats the
    input, so the h_ prefix is restored first.
    """
    search_number = number
    if search_number.startswith("h-"):
        search_number = search_number.replace("h-", "h_")
    search_number = re.sub(r"[^0-9a-zA-Z_]", "", search_number).lower()
    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
    ]
    chosen_url = ""
    for base in fanza_urls:
        chosen_url = base + search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "",})
    try:
        # Old pages may pad the hinban (cid=test012 vs test00012 on the
        # page), so read the hinban first and pass it downstream.
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(getActor(htmlcode)),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(
                getRelease(htmlcode)
            ),  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
        }
    except:
        data = {
            "title": "",
        }
    return json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
if __name__ == "__main__":
    # Manual smoke tests, kept for reference:
    # print(main("DV-1562"))
    # input("[+][+]Press enter key exit, you can check the error messge before you exit.")
    # print(main("ipx292"))
    pass

View File

@@ -1,162 +1,162 @@
import re import re
from lxml import etree#need install from lxml import etree#need install
import json import json
import ADC_function import ADC_function
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(htmlcode):  # page title with the leading id pattern stripped
    """Return the fc2club page title minus the 'XX2-nnn' id prefix."""
    #print(htmlcode)
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    raw = str(tree.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    cleaned = str(re.sub('\D{2}2-\d+', '', raw)).replace(' ', '', 1)
    #print(cleaned)
    return cleaned
def getActor(htmlcode):
    """Return the actor name, '' on any parse failure."""
    try:
        tree = etree.fromstring(htmlcode, etree.HTMLParser())
        return str(tree.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
    except:
        return ''
def getStudio(htmlcode):  # studio / maker
    """Return the studio field."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
def getNum(htmlcode):  # product id
    """Return the product id field."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    hit = str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    #print(hit)
    return hit
def getRelease(htmlcode2):
    """Return the release date from the fc2 article page."""
    #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    return str(tree.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
def getCover(htmlcode, number, htmlcode2):
    """Return the cover URL, preferring the fc2 article page; fall back to fc2club."""
    #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    primary = str(tree.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    if primary != '':
        return 'http:' + primary
    fallback_tree = etree.fromstring(htmlcode, etree.HTMLParser())
    fallback = str(fallback_tree.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
    return 'https://fc2club.com' + fallback
def getOutline(htmlcode2):
    """Return the synopsis text from the FC2 detail page, with escape
    artifacts and stray punctuation cleaned up."""
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    raw = str(tree.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()'))
    cleaned = raw.strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000)
    return cleaned.replace(', ,', '').strip(' ').replace('。,', ',')
def getTag(htmlcode):
    """Return the tag string scraped from the fc2club page (list repr
    flattened and spaces/quotes stripped)."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    raw = str(tree.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return raw.strip(" ['']").replace("'", '').replace(' ', '')
def getYear(release):
    """Return the first 4-digit year found in *release*, or '' if none.

    Fixes: raw-string regex (avoids the invalid-escape warning) and a
    narrowed except clause instead of the original bare ``except:``
    (which also swallowed KeyboardInterrupt/SystemExit).
    """
    try:
        return re.search(r'\d{4}', release).group()
    except (AttributeError, TypeError):
        # AttributeError: no match (search returned None);
        # TypeError: *release* is not a string.
        return ''
def getTitle_fc2com(htmlcode):
    """Return the title text from an adult.contents.fc2.com page.

    Raises IndexError when the node is absent (caller handles failures).
    """
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return tree.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
def getActor_fc2com(htmlcode):
    """Return the seller/actor name from the FC2 page, or '' on failure.

    Fix: the original bare ``except:`` is narrowed to ``except
    Exception`` so KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        tree = etree.fromstring(htmlcode, etree.HTMLParser())
        return tree.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
    except Exception:
        # Node missing (IndexError) or unparsable input.
        return ''
def getStudio_fc2com(htmlcode):
    """Return the studio/seller string from the FC2 page, or '' on failure.

    Fix: the original bare ``except:`` is narrowed to ``except
    Exception`` so KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        tree = etree.fromstring(htmlcode, etree.HTMLParser())
        return str(tree.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
    except Exception:
        return ''
def getNum_fc2com(htmlcode):
    """Return the id/number text from the FC2 page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
def getRelease_fc2com(htmlcode2):
    """Return the release-date text from the FC2 detail page."""
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    return str(tree.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
def getCover_fc2com(htmlcode2):
    """Return the cover image URL (protocol-relative src prefixed with http:)."""
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    src = str(tree.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
    return 'http:' + src
def getOutline_fc2com(htmlcode2):
    """Return the synopsis text from the FC2 page, cleaned of escape
    artifacts and stray punctuation."""
    tree = etree.fromstring(htmlcode2, etree.HTMLParser())
    raw = str(tree.xpath('/html/body/div/text()'))
    cleaned = raw.strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000)
    return cleaned.replace(', ,', '').strip(' ').replace('。,', ',')
def getTag_fc2com(number):
    """Return the list of tag strings for *number* via the FC2 tag API.

    The API response is unicode-escaped JSON-ish text; tags are pulled
    out with a regex rather than a JSON parse.
    """
    api_url = 'http://adult.contents.fc2.com/api/v4/article/' + number + '/tag?'
    decoded = bytes(ADC_function.get_html(api_url), 'utf-8').decode('unicode-escape')
    return re.findall('"tag":"(.*?)"', decoded)
def getYear_fc2com(release):
    """Return the first 4-digit year in *release*, or '' when absent.

    Fixes: raw-string regex and a narrowed except clause instead of the
    original bare ``except:``.
    """
    try:
        return re.search(r'\d{4}', release).group()
    except (AttributeError, TypeError):
        # No match, or *release* is not a string.
        return ''
def main(number):
    """Scrape metadata for FC2 id *number* (digits only, no 'FC2-' prefix).

    Primary source is fc2club.com; when no title is found there, the
    official adult.contents.fc2.com page is parsed instead (the wei6H
    cookie bypasses its age gate).  Returns a pretty-printed JSON
    string; on any error a stub {"title": ""} is returned so the caller
    can detect the miss.
    """
    try:
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
        actor = getActor(htmlcode)
        if actor == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
            'outline': '',  # getOutline(htmlcode2),
            # NOTE(review): 'runtime' is filled with a year-like value —
            # looks like a long-standing field mix-up; kept for compatibility.
            'runtime': getYear(getRelease(htmlcode)),
            'director': getStudio(htmlcode),  # site exposes no director; studio reused
            'actor': actor,
            # BUG FIX: was getRelease(number) — that parsed the bare id
            # string as HTML and therefore always produced ''.
            'release': getRelease(htmlcode2),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'actor_photo': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'https://fc2club.com//html/FC2-' + number + '.html',
        }
        if dic['title'] == '':
            # fc2club miss: retry against the official FC2 page.
            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', cookies={'wei6H': '1'})
            actor = getActor(htmlcode)
            if actor == '':
                actor = 'FC2系列'
            dic = {
                'title': getTitle_fc2com(htmlcode2),
                'studio': getStudio_fc2com(htmlcode2),
                'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
                'outline': getOutline_fc2com(htmlcode2),
                'runtime': getYear_fc2com(getRelease(htmlcode2)),
                'director': getStudio_fc2com(htmlcode2),
                'actor': actor,
                # BUG FIX: was getRelease_fc2com(number) — same id-as-HTML bug.
                'release': getRelease_fc2com(htmlcode2),
                'number': 'FC2-' + number,
                'cover': getCover_fc2com(htmlcode2),
                'imagecut': 0,
                'tag': getTag_fc2com(number),
                'label': '',
                'actor_photo': '',
                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
            }
    except Exception:
        # (TODO) surface the error instead of silently returning a stub.
        dic = {"title": ""}
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
#print(main('1252953')) #print(main('1252953'))

276
javbus.py
View File

@@ -1,138 +1,138 @@
import re import re
from pyquery import PyQuery as pq#need install from pyquery import PyQuery as pq#need install
from lxml import etree#need install from lxml import etree#need install
from bs4 import BeautifulSoup#need install from bs4 import BeautifulSoup#need install
import json import json
from ADC_function import * from ADC_function import *
def getActorPhoto(htmlcode):
    """Map each actress name on a javbus detail page to her portrait URL.

    Follows every 'star-name' link (one extra HTTP request per actress)
    and reads the portrait from the star's own page.
    """
    photos = {}
    soup = BeautifulSoup(htmlcode, 'lxml')
    for star in soup.find_all(attrs={'class': 'star-name'}):
        href = star.a['href']
        name = star.get_text()
        star_page = etree.fromstring(get_html(href), etree.HTMLParser())
        photos[name] = str(star_page.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
    return photos
def getTitle(htmlcode):
    """Return the page title with spaces turned into '-' and any
    'n<digits>-' fragment stripped."""
    raw = str(pq(htmlcode)('div.container h3').text()).replace(' ', '-')
    try:
        return re.sub('n\d+-', '', raw)
    except:
        # Defensive: fall back to the unstripped title.
        return raw
def getStudio(htmlcode):
    """Return the studio/maker name from a javbus detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
def getYear(htmlcode):
    """Return the release-date text (despite the name, not just the
    year — callers extract the 4-digit year themselves)."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
def getCover(htmlcode):
    """Return the full-size cover URL (href of the 'bigImage' anchor)."""
    return pq(htmlcode)('a.bigImage').attr('href')
def getRelease(htmlcode):
    """Return the release-date text (same node as getYear — the two
    functions are duplicates of each other)."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
def getRuntime(htmlcode):
    """Return the runtime text node (the one containing '分鐘'), or None
    when the page has no such node."""
    return BeautifulSoup(htmlcode, 'lxml').find(text=re.compile('分鐘'))
def getActor(htmlcode):
    """Return the list of actress names found on the detail page."""
    soup = BeautifulSoup(htmlcode, 'lxml')
    return [node.get_text() for node in soup.find_all(attrs={'class': 'star-name'})]
def getNum(htmlcode):
    """Return the release id/number shown on the detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
def getDirector(htmlcode):
    """Return the director's name from the detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
def getOutline(htmlcode):
    """Return the synopsis paragraph from a DMM mono page."""
    return str(pq(htmlcode)('tr td div.mg-b20.lh4 p.mg-b20').text())
def getSerise(htmlcode):
    """Return the series name (sic: 'Serise' kept for caller compatibility)."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
def getTag(htmlcode):
    """Return the genre tags, skipping interactive widgets (the ones
    carrying an 'onmouseout' handler)."""
    soup = BeautifulSoup(htmlcode, 'lxml')
    return [node.get_text()
            for node in soup.find_all(attrs={'class': 'genre'})
            if 'onmouseout' not in str(node)]
def main(number):
    """Scrape javbus (censored layout) for *number* and return a JSON string.

    The DMM mono page is fetched best-effort for the synopsis.  Any
    parse failure falls through to the uncensored layout via
    main_uncensored().

    Fixes: bare ``except:`` clauses narrowed to ``except Exception`` so
    KeyboardInterrupt/SystemExit propagate; raw-string regex patterns.
    """
    try:
        htmlcode = get_html('https://www.javbus.com/' + number)
        try:
            # Synopsis comes from DMM; it is optional, so swallow failures.
            dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
        except Exception:
            dww_htmlcode = ''
        dic = {
            'title': str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))),
            'studio': getStudio(htmlcode),
            'year': str(re.search(r'\d{4}', getYear(htmlcode)).group()),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'label': getSerise(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': 'https://www.javbus.com/' + number,
            'source': 'javbus.py',
        }
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    except Exception:
        # Censored-layout parse failed (e.g. no year on the page);
        # try the uncensored page layout instead.
        return main_uncensored(number)
def main_uncensored(number):
    """Scrape javbus' uncensored layout for *number* and return a JSON string."""
    htmlcode = get_html('https://www.javbus.com/' + number)
    dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    if getTitle(htmlcode) == '':
        # Some uncensored ids separate parts with '_' instead of '-'.
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
        dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    # Title: strip the 'XXX-123-' prefix and the number itself.
    title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', '')
    dic = {
        'number': getNum(htmlcode),
        'title': title,
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'release': getRelease(htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
    }
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

246
javdb.py
View File

@@ -1,123 +1,123 @@
import re import re
from lxml import etree from lxml import etree
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
    """Return the title from a javdb detail page.

    Raises IndexError when the node is absent (main() catches it).
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    return tree.xpath("/html/body/section/div/h2/strong/text()")[0]
def getActor(a):
    """Return the performer names as a single ', '-separated string."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
def getActorPhoto(actor):
    """Build a name -> photo-URL dict from a comma-separated actor string.

    javdb exposes no portraits, so every value is the empty string.
    """
    return {name: '' for name in actor.split(',')}
def getStudio(a):
    """Return the studio ('片商') name from a javdb detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
    """Return the runtime ('時長') text.

    NOTE(review): .rstrip('mi') strips trailing 'm'/'i' characters, not
    the suffix string — presumably meant to drop 'mi(n)'; confirm.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').rstrip('mi')
def getLabel(a):
    """Return the series ('系列') name from a javdb detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
    """Return the release id ('番號'); linked part first, then plain text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(linked + plain).strip('+')
def getYear(getRelease):
    """Return the 4-digit year inside the release string, or the input
    unchanged when no year can be found.

    NOTE: the parameter name shadows the module-level getRelease()
    function; kept for interface compatibility.  Fixes: raw-string
    regex and narrowed except instead of the original bare ``except:``.
    """
    try:
        return str(re.search(r'\d{4}', getRelease).group())
    except (AttributeError, TypeError):
        # No match or non-string input: fall back to the raw value.
        return getRelease
def getRelease(a):
    """Return the release-date ('時間') text from a javdb detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+')
def getTag(a):
    """Return the genre tags ('类别') as a comma-joined string."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')
def getCover_small(a, index=0):
    """Return the thumbnail URL of the *index*-th search result.

    javdb may list several candidates for one query; the caller passes
    the index of the entry whose id matched, so we must not blindly
    take the first.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    url = tree.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    if 'https' not in url:
        url = 'https:' + url
    return url
def getCover(htmlcode):
    """Return the full-size cover URL from the detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath("//div[@class='column column-video-cover']/a/img/@src")).strip(" ['']")
def getDirector(a):
    """Return the director ('導演') name from a javdb detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
    """Return the first synopsis paragraph from the detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
def main(number):
    """Search javdb for *number* and return its metadata as a JSON string.

    The search page may list several candidates; the one whose uid text
    exactly equals the (upper-cased) number is selected, and its detail
    page is scraped.  On any failure a stub {"title": ""} is returned.
    """
    try:
        number = number.upper()
        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
        # javdb sometime returns multiple results,
        # and the first elememt maybe not the one we are looking for
        # iterate all candidates and find the match one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
        ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        # ids.index raises ValueError when no candidate matches;
        # the broad except below turns that into the empty-title stub.
        correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            # Thumbnail must come from the SAME search-result index as the url.
            'cover_small': getCover_small(query_result, index=ids.index(number)),
            'imagecut': 3,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
    return js
# main('DV-1562') # main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
#print(main('ipx-292')) #print(main('ipx-292'))

View File

@@ -1,111 +1,111 @@
import re import re
from lxml import etree from lxml import etree
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
    """Return the page title with '/' replaced by ',', or '' on any failure."""
    try:
        tree = etree.fromstring(a, etree.HTMLParser())
        raw = str(tree.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
        return raw.replace('/', ',')
    except:
        return ''
def getActor(a):
    """Return the performers ('出演:') as a comma-separated string
    ('/' separators on the page are converted to ',')."""
    tree = etree.fromstring(a, etree.HTMLParser())
    linked = str(tree.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    plain = str(tree.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(linked + plain).strip('+').replace("', '", '').replace('"', '').replace('/', ',')
def getStudio(a):
    """Extract the 'studio' field from the detail table.

    NOTE(review): the xpath targets the シリーズ: ("series") row, which
    looks copy-pasted from getLabel — confirm this is the intended field.
    """
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # List-to-string plus punctuation stripping, the module-wide idiom.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"シリーズ:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"シリーズ:")]/../td/text()')
    return (linked + plain).strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
    """Extract the runtime from the 収録時間: ("recording time") row."""
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # Stringify the xpath result list and strip list punctuation.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"収録時間:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"収録時間:")]/../td/text()')
    # NOTE(review): rstrip('mi') removes trailing 'm'/'i' characters only;
    # if the site renders e.g. "90min" the trailing 'n' survives — verify
    # against a live page before "fixing".
    return (linked + plain).strip('+').rstrip('mi')
def getLabel(a):
    """Extract the label/series from the シリーズ: row of the detail table."""
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # List-to-string plus punctuation/escaped-newline stripping.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"シリーズ:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"シリーズ:")]/../td/text()')
    return (linked + plain).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
    """Extract the product number from the 品番: ("product no.") row."""
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # Stringify the xpath result and strip list punctuation.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"品番:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"品番:")]/../td/text()')
    return (linked + plain).strip('+')
def getYear(getRelease):
    """Return the first 4-digit year found in a release-date string.

    Falls back to the unmodified input when no 4-digit run exists (or the
    input is not a string), preserving the original best-effort contract.

    NOTE: the parameter name shadows the sibling getRelease() function;
    kept unchanged because callers may pass it by keyword.
    """
    try:
        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (DeprecationWarning, eventually an error) on modern Python.
        return str(re.search(r'\d{4}', getRelease).group())
    except (AttributeError, TypeError):
        # AttributeError: re.search found no match and returned None;
        # TypeError: input was not str/bytes. Narrowed from the original
        # bare except so genuine bugs are no longer swallowed.
        return getRelease
def getRelease(a):
    """Extract the release date from the 配信開始日: ("streaming start") row."""
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # Stringify the xpath result list and strip list punctuation.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"配信開始日:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"配信開始日:")]/../td/text()')
    return (linked + plain).strip('+')
def getTag(a):
    """Extract genre tags from the ジャンル: ("genre") row as one string,
    with escaped-newline separators turned into commas."""
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # List-to-string plus punctuation/escaped-newline stripping.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"ジャンル:")]/../td/a/text()')
    plain = cell('//th[contains(text(),"ジャンル:")]/../td/text()')
    merged = (linked + plain).strip('+')
    return merged.replace("', '\\n", ",").replace("', '", "").replace('"', '')
def getCover(htmlcode):
    """Return the cover-image URL taken from the page's main <h2><img>."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')
    # alternate xpath seen working:
    # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
    return str(src).strip(" ['']")
def getDirector(a):
    """Extract the 'director' field from the detail table.

    NOTE(review): the xpath targets the シリーズ ("series") row, not a
    director row (監督) — likely a copy-paste bug; verify against the site
    before changing, as downstream NFOs may rely on the current output.
    """
    tree = etree.fromstring(a, etree.HTMLParser())

    def cell(path):
        # Stringify the xpath result and strip list punctuation.
        return str(tree.xpath(path)).strip(" ['']").strip('\\n ').strip('\\n')

    linked = cell('//th[contains(text(),"シリーズ")]/../td/a/text()')
    plain = cell('//th[contains(text(),"シリーズ")]/../td/text()')
    return (linked + plain).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
    """Return the synopsis: the stringified text of every <p> element
    in the (pre-trimmed) introduction fragment."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    paragraphs = tree.xpath('//p/text()')
    return str(paragraphs).strip(" ['']")
def main(number2):
    """Scrape the mgstage product page for *number2* and return the
    collected metadata serialized as a pretty-printed JSON string.

    The product number is upper-cased before building the URL; the adc=1
    cookie bypasses the site's age-confirmation interstitial.
    """
    number = number2.upper()
    url = 'https://www.mgstage.com/product/product_detail/' + str(number) + '/'
    htmlcode = str(get_html(url, cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')

    def flatten(node):
        # Collapse pretty-printing whitespace so the per-field extractors
        # see one continuous string.
        # NOTE(review): the exact run-lengths of spaces in these literals
        # may have been collapsed by text extraction — verify against the
        # repository original before relying on them.
        return str(node).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')

    a = flatten(soup.find(attrs={'class': 'detail_data'}))   # spec table
    b = flatten(soup.find(attrs={'id': 'introduction'}))     # synopsis block
    print(b)  # debug output retained from the original implementation
    dic = {
        'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
        'studio': getStudio(a),
        'outline': getOutline(b),
        'runtime': getRuntime(a),
        'director': getDirector(a),
        'actor': getActor(a),
        'release': getRelease(a),
        'number': getNum(a),
        'cover': getCover(htmlcode),
        'imagecut': 0,
        'tag': getTag(a),
        'label': getLabel(a),
        'year': getYear(getRelease(a)),
        'actor_photo': '',
        'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
        'source': 'mgstage.py',
    }
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
#print(htmlcode)
if __name__ == '__main__':
    # Manual smoke test. Guarded so that importing this scraper module no
    # longer fires a live HTTP request as an import-time side effect.
    print(main('SIRO-3607'))