Merge branch 'upstream'

# Conflicts:
#	WebCrawler/fanza.py
Author: Deng Zhou
Date:   2022-04-29 23:53:21 +08:00
34 changed files with 25968 additions and 18336 deletions

WebCrawler/__init__.py

@@ -24,6 +24,7 @@ from . import carib
from . import fc2club
from . import mv91
from . import madou
from . import gcolle
def get_data_state(data: dict) -> bool:  # detect whether metadata fetching failed
@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
"carib": carib.main,
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main
"madou": madou.main,
"gcolle": gcolle.main,
}
conf = config.getInstance()
@@ -91,6 +93,8 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("fc2")))
if "fc2club" in sources:
sources.insert(0, sources.pop(sources.index("fc2club")))
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources.insert(0, sources.pop(sources.index("gcolle")))
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
@@ -100,6 +104,12 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("javdb")))
if "xcity" in sources:
sources.insert(0, sources.pop(sources.index("xcity")))
if "madou" in sources:
sources.insert(0, sources.pop(sources.index("madou")))
elif "madou" in sources and (
re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources.insert(0, sources.pop(sources.index("madou")))
# check sources in func_mapping
todel = []
@@ -124,7 +134,10 @@ def get_data_from_json(file_number, oCC):
for source in sources:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
result = pool.apply_async(func_mapping[source], (file_number,)).get()
try:
    json_data = json.loads(result)
except (TypeError, ValueError):
    # some crawlers (e.g. gcolle.py) return a dict rather than a JSON string
    json_data = result
# if any service returns a valid result, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -136,7 +149,10 @@ def get_data_from_json(file_number, oCC):
try:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(func_mapping[source](file_number))
result = func_mapping[source](file_number)
try:
    json_data = json.loads(result)
except (TypeError, ValueError):
    # some crawlers (e.g. gcolle.py) return a dict rather than a JSON string
    json_data = result
# if any service returns a valid result, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -242,8 +258,8 @@ def get_data_from_json(file_number, oCC):
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.load(
open(str(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json'), 'r', encoding="utf-8"))
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue
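
Note: the tolerant parse above exists because newer crawlers (gcolle.py below) return a dict directly, while the rest return JSON strings. A minimal sketch of the pattern, with load_crawler_result as a hypothetical helper name:

    import json

    def load_crawler_result(raw):
        """Accept either a JSON string or an already-decoded dict."""
        if isinstance(raw, dict):
            return raw
        return json.loads(raw)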

WebCrawler/avsox.py

@@ -5,6 +5,7 @@ from lxml import etree
import json
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -17,95 +18,64 @@ def getActorPhoto(html):
p2 = {t: l}
d.update(p2)
return d
def getTitle(html):
try:
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.find('span').text)
return d
def getStudio(html):
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
def getRuntime(html):
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
def getLabel(html):
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
def getNum(html):
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
try:
result = str(re.search('\d{4}',release).group())
return result
except:
return release
def getRelease(html):
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
def getCover(html):
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x[2:]] if len(x) > 2 else []
def getSeries(html):
try:
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
return ''
def main(number):
html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
a = get_html(site + '/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
avsox_crawler2 = Crawler(a)
avsox_crawler = Crawler(detail)
try:
new_number = getNum(lx)
new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = getTitle(lx).strip(new_number)
title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
dic = {
'actor': getActor(lx),
'title': title,
'studio': getStudio(lx),
'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
'outline': getStoryline(number, title),
'runtime': getRuntime(lx),
'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
'director': '', #
'release': getRelease(lx),
'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
'number': new_number,
'cover': getCover(lx),
'cover_small': getCover_small(html),
'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
#'cover_small' : getCover_small(html),
'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
'imagecut': 3,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)),
'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
'year': re.findall(r'\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': getSeries(lx),
'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
}
except Exception as e:
if config.getInstance().debug():

WebCrawler/carib.py

@@ -40,6 +40,7 @@ def main(number: str) -> json:
'website': f'{G_SITE}/moviepages/{number}/index.html',
'source': 'carib.py',
'series': get_series(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
@@ -59,7 +60,7 @@ def get_year(lx: html.HtmlElement) -> str:
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
g = getStoryline(number, title, 无码=True)
if len(g):
return g
return o

WebCrawler/crawler.py (new file)

@@ -0,0 +1,28 @@
from lxml import etree
class Crawler:
    """Thin lxml wrapper: a missing node yields '' instead of raising."""
    def __init__(self, htmlcode):
        self.html = etree.HTML(htmlcode)
    def getString(self, _xpath):
        # first matching node as a string, or '' when nothing matches
        if _xpath == "":
            return ""
        result = self.html.xpath(_xpath)
        try:
            return result[0]
        except IndexError:
            return ""
    def getStrings(self, _xpath):
        # all matching nodes as a list
        return self.html.xpath(_xpath)
    def getOutline(self, _xpath):
        # matching text nodes joined with newlines (multi-paragraph outlines)
        result = self.html.xpath(_xpath)
        try:
            return "\n".join(result)
        except TypeError:
            return ""

WebCrawler/dlsite.py

@@ -1,15 +1,14 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
#title //*[@id="work_name"]/a/text()
#print(get_html('https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html'))
#title /html/head/title/text()
#studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
#release //th[contains(text(),"販売日")]/../td/a/text()
#story //th[contains(text(),"シナリオ")]/../td/a/text()
@@ -18,14 +17,14 @@ from ADC_function import *
#jianjie //*[@id="main_inner"]/div[3]/text()
#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src
#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html
#https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath('//*[@id="work_name"]/a/text()')[0]
def getTitle(html):
result = str(html.xpath('/html/head/title/text()')[0])
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except:
@@ -38,8 +37,7 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
p={i:''}
d.update(p)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getStudio(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -53,8 +51,7 @@ def getRuntime(a):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -69,12 +66,10 @@ def getYear(getRelease):
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRelease(html):
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
return result1.replace('年', '-').replace('月', '-').replace('日', '')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getTag(html):
try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result
@@ -96,26 +91,22 @@ def getCover_small(a, index=0):
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getCover(html):
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
return result.replace('.webp', '.jpg')
def getDirector(html):
try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except:
result = ''
return result
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getOutline(html):
total = []
result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getSeries(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -127,28 +118,28 @@ def getSeries(a):
def main(number):
try:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN',
cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'actor': getActor(html),
'title': getTitle(html),
'studio': getStudio(html),
'outline': getOutline(html),
'runtime': '',
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'director': getDirector(html),
'release': getRelease(html),
'number': number,
'cover': 'https:' + getCover(htmlcode),
'cover': 'https:' + getCover(html),
'cover_small': '',
'imagecut': 0,
'tag': getTag(htmlcode),
'label': getLabel(htmlcode),
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
'tag': getTag(html),
'label': getLabel(html),
'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(htmlcode),
'series': getSeries(html),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -166,4 +157,6 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('VJ013178'))
print(main('RJ329607'))
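
The getCover() change tracks DLsite's switch to <picture> elements serving WebP: the crawler now reads the srcset and swaps the extension back to .jpg. A hedged one-liner with a made-up srcset value:

    srcset = '//img.dlsite.jp/example/RJ329607_img_main.webp'  # hypothetical path
    cover = 'https:' + srcset.replace('.webp', '.jpg')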

WebCrawler/fanza.py

@@ -9,130 +9,33 @@ from urllib.parse import urlencode
from lxml import etree
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getTitle(text):
html = etree.fromstring(text, etree.HTMLParser())
result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
return result
def getFanzaStrings(self, string):
result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
def getActor(text):
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(text, etree.HTMLParser())
result = (
str(
html.xpath(
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
)
)
.strip(" ['']")
.replace("', '", ",")
)
return result
def getRelease(fanza_Crawler):
result = fanza_Crawler.getFanzaString('発売日:')
if result == '----':
result = fanza_Crawler.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getStudio(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
)[0]
return result
def getRuntime(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
return re.search(r"\d+", str(result)).group()
def getLabel(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
)[0]
return result
def getNum(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
)[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
def getRelease(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
result = "----"
if result == "----":
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
pass
return result.replace("/", "-")
def getTag(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
)
return result
except:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
)
return result
def getCover(text, number):
html = etree.fromstring(text, etree.HTMLParser())
def getCover(html, number):
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
@@ -151,29 +54,11 @@ def getCover(text, number):
return result
def getDirector(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getOutline(html):
try:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
)[0]
return result
def getOutline(text):
html = etree.fromstring(text, etree.HTMLParser())
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
except:
# (TODO) handle more edge case
# print(html)
@@ -181,23 +66,8 @@ def getOutline(text):
return result
def getSeries(text):
try:
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
)[0]
return result
except:
return ""
def getExtrafanart(htmlcode):  # extract the sample stills
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div></div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
@@ -232,6 +102,7 @@ def main(number):
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls:
chosen_url = url + fanza_search_number
@@ -240,6 +111,7 @@ def main(number):
urlencode({"rurl": chosen_url})
)
)
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
@@ -249,28 +121,34 @@ def main(number):
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to the following functions
fanza_hinban = getNum(htmlcode)
fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
data = {
"title": getTitle(htmlcode).strip(),
"studio": getStudio(htmlcode),
"outline": getOutline(htmlcode),
"runtime": getRuntime(htmlcode),
"director": getDirector(htmlcode) if "anime" not in chosen_url else "",
"actor": getActor(htmlcode) if "anime" not in chosen_url else "",
"release": getRelease(htmlcode),
"number": fanza_hinban,
"cover": getCover(htmlcode, fanza_hinban),
"title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(html),
"runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
"actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
"release": getRelease(fanza_Crawler),
"number": out_num,
"cover": getCover(html, fanza_hinban),
"imagecut": 1,
"tag": getTag(htmlcode),
"tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode),
"label": getLabel(htmlcode),
"year": getYear(
getRelease(htmlcode)
), # str(re.search('\d{4}',getRelease(a)).group()),
"label": fanza_Crawler.getFanzaString('レーベル'),
"year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
"series": getSeries(htmlcode),
"series": fanza_Crawler.getFanzaString('シリーズ:'),
}
except:
data = {
@@ -314,4 +192,6 @@ def main_htmlcode(number):
if __name__ == "__main__":
# print(main("DV-1562"))
# print(main("96fad1217"))
print(main("h_173ghmt68"))
print(main("pred00251"))
print(main("MIAA-391"))
print(main("OBA-326"))

WebCrawler/fc2.py

@@ -4,58 +4,11 @@ import re
from lxml import etree#need install
import json
import ADC_function
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle_fc2com(htmlcode):  # get the title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
return result
except:
return ''
def getStudio_fc2com(htmlcode):  # get the studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode):  # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease_fc2com(htmlcode2): #
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
return result
def getCover_fc2com(htmlcode2):  # get the cover
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
return 'http:' + result
# def getOutline_fc2com(htmlcode2): #获取番号 #
# xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
# path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
# html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
# print('https://adult.contents.fc2.com'+path)
# print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
# result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
# return result
def getTag_fc2com(lx):
result = lx.xpath("//a[@class='tag tagTag']/text()")
return result
def getYear_fc2com(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def getExtrafanart(htmlcode):  # extract the sample stills
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode)
@@ -79,27 +32,30 @@ def getTrailer(htmlcode, number):
except:
return ''
else:
video_url = ''
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
actor = getActor_fc2com(htmlcode2)
if not actor:
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
fc2_crawler = Crawler(htmlcode2)
actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = {
'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': re.findall(r'\d{4}', release)[0],
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': getStudio_fc2com(htmlcode2),
'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor,
'release': getRelease_fc2com(htmlcode2),
'release': release,
'number': 'FC2-' + number,
'label': '',
'cover': cover,
@@ -107,7 +63,7 @@ def main(number):
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number),
'imagecut': 0,
'tag': getTag_fc2com(lx),
'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/',
@@ -121,6 +77,4 @@ def main(number):
return js
if __name__ == '__main__':
print(main('FC2-1787685'))
print(main('FC2-2086710'))
print(main('FC2-2182382'))

WebCrawler/gcolle.py (new file)

@@ -0,0 +1,88 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
from lxml import etree
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cookies, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cookies)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
if cookies_filepath and Path(cookies_filepath).is_file():
    Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))
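
gcolle.py passes the R18 age gate once per session, then persists the cookies so later runs skip the gate. The save logic above, restated as a hedged standalone helper:

    import json
    from pathlib import Path

    def save_session_cookies(session, cookie_filename='gcolle.json'):
        cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
        cookies_save.parent.mkdir(parents=True, exist_ok=True)
        cookies_save.write_text(json.dumps(session.cookies.get_dict(),
                                           sort_keys=True, indent=4), encoding='utf-8')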


@@ -56,9 +56,9 @@ def parse_info(soup: BeautifulSoup) -> dict:
"label": get_label(data_dic),
"studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic),
"number": get_number(data_dic).upper(),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic),
"runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic),
}
else:

WebCrawler/javbus.py

@@ -60,10 +60,10 @@ def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number, title):  # fetch the storyline via concurrent multi-site queries
def getOutline(number, title, uncensored):  # fetch the storyline via concurrent multi-site queries
    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
        return ''  # calls from airav.py skip the outline, avoiding duplicate scraping that slows processing
return getStoryline(number,title)
return getStoryline(number,title, 无码=uncensored)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
@@ -83,9 +83,13 @@ def getExtrafanart(htmlcode): # 获取剧照
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number)
w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -94,7 +98,7 @@ def main_uncensored(number):
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
@@ -106,9 +110,10 @@ def main_uncensored(number):
'label': getSeriseJa(lx),
'imagecut': 0,
# 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number,
'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py',
'series': getSeriseJa(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -136,7 +141,7 @@ def main(number):
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
@@ -151,6 +156,7 @@ def main(number):
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(lx),
'无码': getUncensored(lx)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -168,13 +174,14 @@ def main(number):
return js
if __name__ == "__main__" :
config.G_conf_override['debug_mode:switch'] = True
print(main('ABP-888'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
config.getInstance().set_override("debug_mode:switch=1")
# print(main('ABP-888'))
# print(main('ABP-960'))
# print(main('ADV-R0624')) # 404
# print(main('MMNT-010'))
# print(main('ipx-292'))
# print(main('CEMD-011'))
# print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001'))
print(main('AVSW-061'))
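
main_uncensored() now targets javbus.red and normalizes western release numbers by mapping dots to hyphens before building the URL; a one-line restatement:

    w_number = 'BrazzersExxtra.21.02.01'.replace('.', '-')
    url = 'https://www.javbus.red/' + w_number  # -> https://www.javbus.red/BrazzersExxtra-21-02-01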

WebCrawler/javdb.py

@@ -166,12 +166,23 @@ def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title):  # fetch the storyline via concurrent multi-site queries
    return getStoryline(number,title)
def getOutline(number, title, uncensored):  # fetch the storyline via concurrent multi-site queries
    return getStoryline(number, title, 无码=uncensored)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
def main(number):
# after the javdb update, only one numbered mirror can be logged in at a time (a new login kicks out the old one), so the site is chosen from the first javdb*.json cookie file found
@@ -276,7 +287,7 @@ def main(number):
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
@@ -293,8 +304,12 @@ def main(number):
'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py',
'series': getSeries(lx),
'无码': getUncensored(lx)
}
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
@@ -313,18 +328,19 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30'))
# print(main('AGAV-042'))
# print(main('BANK-022'))
# print(main('070116-197'))
print(main('070116-197'))
# print(main('093021_539')) # 没有剧照 片商pacopacomama
#print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
#print(main('MVSD-439'))
# print(main('EHM0001')) # not found
print(main('FC2-2314275'))
#print(main('FC2-2314275'))
# print(main('EBOD-646'))
# print(main('LOVE-262'))
#print(main('ABP-890'))
print(main('ABP-890'))
print(main('blacked.14.12.08'))
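
getUserRating() parses javdb's score line of the form '4.5分, 由123人評價' into a (score, votes) tuple; an equivalent check with a slightly tightened pattern:

    import re

    text = '4.5分, 由123人評價'  # sample score line
    v = re.findall(r'(\d+(?:\.\d+)?)分, 由(\d+)人評價', text)
    print(float(v[0][0]), int(v[0][1]))  # -> 4.5 123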

WebCrawler/madou.py

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup # need install
from lxml import etree # need install
from pyquery import PyQuery as pq # need install
@@ -5,24 +7,22 @@ from ADC_function import *
import json
import re
from lib2to3.pgen2 import parse
import sys
from urllib.parse import urlparse, unquote
sys.path.append('../')
def getActorPhoto(html):
return ''
def getTitle(html, number):  # get the title
title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
try:
result = str(re.split(r'[/||-]', title)[1])
return result.strip()
except:
return title.replace(number.upper(), '').strip()
def getTitle(html):  # get the title
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(html.xpath("/html/head/title/text()")[0])
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getStudio(html):  # get the studio (revised)
try:
@@ -61,7 +61,6 @@ def getNum(url, number): # 获取番号
filename = unquote(urlparse(url).path)
# trim the filename
result = filename[1:-5].upper().strip()
print(result)
# strip the Chinese characters
if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
@@ -83,13 +82,15 @@ def getSerise(html): # 获取系列 已修改
return ''
def getTag(html):  # get the tags
return html.xpath('//div[@class="article-tags"]/a/text()')
def getTag(html, studio):  # get the tags
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html):  # extract the sample stills
return ''
def cutTags(tags):
actors = []
tags = []
@@ -109,13 +110,15 @@ def main(number):
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
tags = getTag(html)
actor,tags = cutTags(tags);
studio = getStudio(html)
tags = getTag(html, studio)
#actor,tags = cutTags(tags)  # the actor's position among the tags is not fixed, so extraction was abandoned
actor = ''
dic = {
# title
'title': getTitle(html, number),
'title': getTitle(html),
# studio
'studio': getStudio(html),
'studio': studio,
# year
'year': getYear(html),
# outline
@@ -143,7 +146,8 @@ def main(number):
'website': url,
'source': 'madou.py',
# series
'series': getSerise(html)
'series': getSerise(html),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -161,4 +165,11 @@ def main(number):
if __name__ == '__main__':
print(main('MD0094'))
config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))
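
The new getTitle() derives the title from the browser <title>, stripping the leading product number and the trailing -麻豆社 site suffix (see the sample titles in the comments above):

    import re

    browser_title = 'MD0140-2 / 家有性事EP2 爱在身边-麻豆社'
    title = re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0].strip()
    print(title)  # -> '家有性事EP2 爱在身边'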

WebCrawler/mgstage.py

@@ -5,95 +5,28 @@ from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
return result.replace('/', ',')
except:
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}',getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace('/','-')
class MgsCrawler(Crawler):
def getMgsString(self, _xpath):
html = self.html
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="EnlargeImage"]/@href')).strip(" ['']")
# result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getExtrafanart(htmlcode):  # extract the sample stills
def getExtrafanart(htmlcode2):  # extract the sample stills
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode)
html = html_pather.search(htmlcode2)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
@@ -104,36 +37,35 @@ def getExtrafanart(htmlcode): # 获取剧照
def main(number2):
number=number2.upper()
htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode, 'lxml')
a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode2, 'lxml')
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b)
try:
dic = {
'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
'studio': getStudio(a),
'outline': getOutline(b),
'runtime': getRuntime(a),
'director': getDirector(a),
'actor': getActor(a),
'release': getRelease(a),
'number': getNum(a),
'cover': getCover(htmlcode),
'imagecut': 1,
'tag': getTag(a),
'label': getLabel(a),
'extrafanart': getExtrafanart(htmlcode),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': getSeries(a),
}
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
dic = {
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'imagecut': 1,
'tag': getTag(a2),
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'extrafanart': getExtrafanart(htmlcode2),
'year': str(re.findall(r'\d{4}', a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
# str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
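
MgsCrawler.getMgsString() evaluates the given xpath plus a variant with 'td/a/' relaxed to 'td/', so a spec-table value is found whether or not it is wrapped in a link. A stripped-down restatement (assuming the <th>label</th><td>value</td> row layout):

    from lxml import etree

    class MiniMgs:
        def __init__(self, htmlcode):
            self.html = etree.HTML(htmlcode)
        def get(self, _xpath):
            linked = self.html.xpath(_xpath)                         # value inside <a>
            plain = self.html.xpath(_xpath.replace('td/a/', 'td/'))  # bare cell text
            return ''.join(linked + plain).strip()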

WebCrawler/storyline.py

@@ -5,7 +5,6 @@ import json
import builtins
from ADC_function import *
from lxml.html import fromstring
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
@@ -13,7 +12,7 @@ from number_parser import is_uncensored
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池')
G_mode_txt = ('顺序执行','线程池')
class noThread(object):
def map(self, fn, param):
@@ -25,14 +24,15 @@ class noThread(object):
# fetch the storyline: query the listed sites concurrently; earlier sites take precedence
def getStoryline(number, title, sites: list=None):
def getStoryline(number, title, sites: list=None, 无码=None):
start_time = time.time()
conf = config.getInstance()
if not conf.is_storyline():
return ''
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number):
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
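
The explicit 无码 flag lets callers (carib.py, javbus.py, javdb.py, madou.py above) override number-based detection when picking storyline sites; restated as a hedged helper:

    from number_parser import is_uncensored

    def storyline_sites(conf, number, 无码=None):
        sites = conf.storyline_site().split(',')
        unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
        if unc:
            sites += conf.storyline_uncensored_site().split(',')
        else:
            sites += conf.storyline_censored_site().split(',')
        return sites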
@@ -49,9 +49,8 @@ def getStoryline(number, title, sites: list=None):
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = conf.storyline_mode()
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
run_mode = 1 if conf.storyline_mode() > 0 else 0
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
@@ -62,7 +61,7 @@ def getStoryline(number, title, sites: list=None):
if not len(sel):
sel = value
return sel
# the debug output below is written to the log; output from pool processes is not, and shows only on stdout
# the debug output below is written to the log
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
for site, desc in zip(apply_sites, results):
@@ -80,34 +79,33 @@ def getStoryline(number, title, sites: list=None):
def getStoryline_mp(args):
def _inner(site, number, title, debug):
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
# in process-pool mode, print() from the getStoryline_*() children is not written to the log; thread-pool and sequential modes are unaffected
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
(site, number, title, debug) = args
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
return _inner(*args)
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
@@ -308,8 +306,8 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
@@ -325,8 +323,9 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
div = lx.xpath('//*[@id="productDescription"]')[0]
ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1 + p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None:
@@ -406,10 +405,10 @@ def amazon_select_one(a_titles, q_title, number, debug):
# in debug mode, log the match-accuracy details
if ratio < 0.9:
    # rejected results with similarity in [0.5, 0.9) are logged separately
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# log the accepted result
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel


@@ -128,7 +128,7 @@ def getOutline(html, number, title):
a = set(storyline_site) & {'airav', 'avno1'}  # only these return Chinese outline text
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
g = getStoryline(number, title, site, 无码=False)
if len(g):
return g
try: