Merge branch 'upstream'
# Conflicts:
#	WebCrawler/fanza.py
WebCrawler/__init__.py

@@ -24,6 +24,7 @@ from . import carib
 from . import fc2club
 from . import mv91
 from . import madou
+from . import gcolle
 
 
 def get_data_state(data: dict) -> bool:  # detect whether metadata retrieval failed

@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
         "carib": carib.main,
         "fc2club": fc2club.main,
         "mv91": mv91.main,
-        "madou": madou.main
+        "madou": madou.main,
+        "gcolle": gcolle.main,
     }
 
     conf = config.getInstance()

@@ -91,6 +93,8 @@ def get_data_from_json(file_number, oCC):
         sources.insert(0, sources.pop(sources.index("fc2")))
     if "fc2club" in sources:
         sources.insert(0, sources.pop(sources.index("fc2club")))
+    elif "gcolle" in sources and (re.search("\d{6}", file_number)):
+        sources.insert(0, sources.pop(sources.index("gcolle")))
     elif "dlsite" in sources and (
         "rj" in lo_file_number or "vj" in lo_file_number
     ):

@@ -100,6 +104,12 @@ def get_data_from_json(file_number, oCC):
         sources.insert(0, sources.pop(sources.index("javdb")))
     if "xcity" in sources:
         sources.insert(0, sources.pop(sources.index("xcity")))
+    if "madou" in sources:
+        sources.insert(0, sources.pop(sources.index("madou")))
+    elif "madou" in sources and (
+        re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
+    ):
+        sources.insert(0, sources.pop(sources.index("madou")))
 
     # check sources in func_mapping
     todel = []

@@ -124,7 +134,10 @@ def get_data_from_json(file_number, oCC):
         for source in sources:
             if conf.debug() == True:
                 print('[+]select', source)
-            json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
+            try:
+                json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
+            except:
+                json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
             # if any service return a valid return, break
             if get_data_state(json_data):
                 print(f"[+]Find movie [{file_number}] metadata on website '{source}'")

@@ -136,7 +149,10 @@ def get_data_from_json(file_number, oCC):
             try:
                 if conf.debug() == True:
                     print('[+]select', source)
-                json_data = json.loads(func_mapping[source](file_number))
+                try:
+                    json_data = json.loads(func_mapping[source](file_number))
+                except:
+                    json_data = func_mapping[source](file_number)
                 # if any service return a valid return, break
                 if get_data_state(json_data):
                     print(f"[+]Find movie [{file_number}] metadata on website '{source}'")

@@ -242,8 +258,8 @@ def get_data_from_json(file_number, oCC):
         if json_data[translate_value] == "":
             continue
         if translate_value == "title":
-            title_dict = json.load(
-                open(str(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json'), 'r', encoding="utf-8"))
+            title_dict = json.loads(
+                (Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
             try:
                 json_data[translate_value] = title_dict[number]
                 continue
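Editor's note: the try/except wrappers added above exist because the per-site crawlers no longer agree on a return type -- most return a JSON string, but the new gcolle.py returns a dict directly. A minimal, self-contained sketch of that normalization (the helper names here are illustrative, not the project's):

    import json

    def normalize_result(raw):
        # Crawlers hand back either a JSON string or an already-built dict.
        try:
            return json.loads(raw)    # JSON string -> dict
        except (TypeError, ValueError):
            return raw                # already a dict; passed through unchanged

    def has_metadata(data) -> bool:
        # Rough stand-in for get_data_state(): a non-empty title means success.
        return isinstance(data, dict) and bool(data.get("title"))

    print(has_metadata(normalize_result('{"title": "example"}')))  # True
    print(has_metadata(normalize_result({"title": ""})))           # False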
WebCrawler/avsox.py

@@ -5,6 +5,7 @@ from lxml import etree
 import json
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
+from WebCrawler.crawler import *
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 

@@ -17,95 +18,64 @@ def getActorPhoto(html):
         p2 = {t: l}
         d.update(p2)
     return d
-def getTitle(html):
-    try:
-        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  #[0]
-        return result.replace('/', '')
-    except:
-        return ''
 
 def getActor(html):
     a = html.xpath('//a[@class="avatar-box"]')
     d = []
     for i in a:
         d.append(i.find('span').text)
     return d
-def getStudio(html):
-    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
-    return result1
-def getRuntime(html):
-    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
-    return result1
-def getLabel(html):
-    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
-    return result1
-def getNum(html):
-    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
-    return result1
-def getYear(release):
-    try:
-        result = str(re.search('\d{4}',release).group())
-        return result
-    except:
-        return release
-def getRelease(html):
-    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
-    return result1
-def getCover(html):
-    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
-    return result
-
-def getCover_small(html):
-    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
-    return result
 def getTag(html):
     x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
     return [i.strip() for i in x[2:]] if len(x) > 2 else []
-def getSeries(html):
-    try:
-        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
-        return result1
-    except:
-        return ''
 
 def main(number):
     html = get_html('https://tellme.pw/avsox')
-    site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
+    site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
     a = get_html(site + '/cn/search/' + number)
-    html = etree.fromstring(a, etree.HTMLParser())
-    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+    html = Crawler(a)
+    result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('-', '_'))
-        html = etree.fromstring(a, etree.HTMLParser())
-        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+        html = Crawler(a)
+        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('_', ''))
-        html = etree.fromstring(a, etree.HTMLParser())
-        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+        html = Crawler(a)
+        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     detail = get_html("https:" + result1)
     lx = etree.fromstring(detail, etree.HTMLParser())
+    avsox_crawler2 = Crawler(a)
+    avsox_crawler = Crawler(detail)
     try:
-        new_number = getNum(lx)
+        new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
         if new_number.upper() != number.upper():
             raise ValueError('number not found')
-        title = getTitle(lx).strip(new_number)
+        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
         dic = {
             'actor': getActor(lx),
             'title': title,
-            'studio': getStudio(lx),
+            'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
             'outline': getStoryline(number, title),
-            'runtime': getRuntime(lx),
+            'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
             'director': '',  #
-            'release': getRelease(lx),
+            'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
             'number': new_number,
-            'cover': getCover(lx),
-            'cover_small': getCover_small(html),
+            'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
+            #'cover_small' : getCover_small(html),
+            'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
             'imagecut': 3,
             'tag': getTag(lx),
-            'label': getLabel(lx),
-            'year': getYear(getRelease(lx)),
+            'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
+            'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
             'actor_photo': getActorPhoto(lx),
             'website': "https:" + result1,
             'source': 'avsox.py',
-            'series': getSeries(lx),
+            'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
         }
     except Exception as e:
         if config.getInstance().debug():
WebCrawler/carib.py

@@ -40,6 +40,7 @@ def main(number: str) -> json:
         'website': f'{G_SITE}/moviepages/{number}/index.html',
         'source': 'carib.py',
         'series': get_series(lx),
+        '无码': True
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js

@@ -59,7 +60,7 @@ def get_year(lx: html.HtmlElement) -> str:
 
 def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
     o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
-    g = getStoryline(number, title)
+    g = getStoryline(number, title, 无码=True)
     if len(g):
         return g
     return o
WebCrawler/crawler.py (new file, 28 lines)

@@ -0,0 +1,28 @@
+from lxml import etree
+
+class Crawler:
+    def __init__(self, htmlcode):
+        self.html = etree.HTML(htmlcode)
+
+    def getString(self, _xpath):
+        if _xpath == "":
+            return ""
+        result = self.html.xpath(_xpath)
+        try:
+            return result[0]
+        except:
+            return ""
+
+    def getStrings(self, _xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return result
+        except:
+            return ""
+
+    def getOutline(self, _xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
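Editor's note: a short illustration of how the per-site modules are meant to drive this new class (hypothetical HTML, and it assumes the Crawler class above is importable). getString returns the first XPath hit, or "" instead of raising IndexError on a miss; getStrings returns the raw result list:

    from WebCrawler.crawler import Crawler

    page = '''
    <html><body>
      <div id="waterfall"><div><a href="//example.invalid/movie/1">link</a></div></div>
      <p><a>Example Studio</a></p>
    </body></html>
    '''
    c = Crawler(page)
    print(c.getString('//*[@id="waterfall"]/div/a/@href'))  # //example.invalid/movie/1
    print(c.getString('//h1/text()'))                       # "" -- missing node, no exception
    print(c.getStrings('//p/a/text()'))                     # ['Example Studio']

One design quirk worth noting: the try/except in getStrings can never fire, since xpath() already returns a list; only getString's result[0] indexing can raise.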
WebCrawler/dlsite.py

@@ -1,15 +1,14 @@
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 import sys
 sys.path.append('../')
 from ADC_function import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 #print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
 #title //*[@id="work_name"]/a/text()
 #print(get_html('https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html'))
 #title /html/head/title/text()
 #studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
 #release //th[contains(text(),"販売日")]/../td/a/text()
 #story //th[contains(text(),"シナリオ")]/../td/a/text()

@@ -18,14 +17,14 @@ from ADC_function import *
 #jianjie //*[@id="main_inner"]/div[3]/text()
 #photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src
 
 #https://www.dlsite.com/pro/work/=/product_id/VJ013152.html
 #https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html
 
-def getTitle(a):
-    html = etree.fromstring(a, etree.HTMLParser())
-    result = html.xpath('//*[@id="work_name"]/a/text()')[0]
+def getTitle(html):
+    result = str(html.xpath('/html/head/title/text()')[0])
+    result = result[:result.rfind(' | DLsite')]
+    result = result[:result.rfind(' [')]
     return result
-def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getActor(html):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
     try:
         result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
     except:

@@ -38,8 +37,7 @@ def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
         p={i:''}
         d.update(p)
     return d
-def getStudio(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getStudio(html):
     try:
         try:
             result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]

@@ -53,8 +51,7 @@ def getRuntime(a):
     result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getLabel(html):
     try:
         try:
             result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]

@@ -69,12 +66,10 @@ def getYear(getRelease):
         return result
     except:
         return getRelease
-def getRelease(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getRelease(html):
     result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
     return result1.replace('年','-').replace('月','-').replace('日','')
-def getTag(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getTag(html):
     try:
         result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
         return result

@@ -96,26 +91,22 @@ def getCover_small(a, index=0):
     if not 'https' in result:
         result = 'https:' + result
     return result
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
-    return result
-def getDirector(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getCover(html):
+    result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
+    return result.replace('.webp', '.jpg')
+def getDirector(html):
     try:
         result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
     except:
         result = ''
     return result
-def getOutline(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getOutline(html):
     total = []
-    result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
+    result = html.xpath('//*[@class="work_parts_area"]/p/text()')
     for i in result:
         total.append(i.strip('\r\n'))
     return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
-def getSeries(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getSeries(html):
     try:
         try:
             result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]

@@ -127,28 +118,28 @@ def getSeries(a):
 def main(number):
     try:
         number = number.upper()
-        htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
+        htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN',
                             cookies={'locale': 'zh-cn'})
 
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
         dic = {
-            'actor': getActor(htmlcode),
-            'title': getTitle(htmlcode),
-            'studio': getStudio(htmlcode),
-            'outline': getOutline(htmlcode),
+            'actor': getActor(html),
+            'title': getTitle(html),
+            'studio': getStudio(html),
+            'outline': getOutline(html),
             'runtime': '',
-            'director': getDirector(htmlcode),
-            'release': getRelease(htmlcode),
+            'director': getDirector(html),
+            'release': getRelease(html),
             'number': number,
-            'cover': 'https:' + getCover(htmlcode),
+            'cover': 'https:' + getCover(html),
             'cover_small': '',
             'imagecut': 0,
-            'tag': getTag(htmlcode),
-            'label': getLabel(htmlcode),
-            'year': getYear(getRelease(htmlcode)),  # str(re.search('\d{4}',getRelease(a)).group()),
+            'tag': getTag(html),
+            'label': getLabel(html),
+            'year': getYear(getRelease(html)),  # str(re.search('\d{4}',getRelease(a)).group()),
             'actor_photo': '',
-            'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
+            'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
             'source': 'dlsite.py',
-            'series': getSeries(htmlcode),
+            'series': getSeries(html),
         }
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
         return js

@@ -166,4 +157,6 @@ def main(number):
 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
 if __name__ == "__main__":
+    config.getInstance().set_override("debug_mode:switch=1")
     print(main('VJ013178'))
+    print(main('RJ329607'))
WebCrawler/fanza.py

@@ -9,130 +9,33 @@ from urllib.parse import urlencode
 from lxml import etree
 
 from ADC_function import *
 
+from WebCrawler.crawler import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
+class fanzaCrawler(Crawler):
+    def getFanzaString(self, string):
+        result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
+        result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
+        return result1+result2
+
-def getTitle(text):
-    html = etree.fromstring(text, etree.HTMLParser())
-    result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
-    return result
+    def getFanzaStrings(self, string):
+        result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
+        if len(result1) > 0:
+            return result1
+        result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
+        return result2
 
 
-def getActor(text):
-    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    html = etree.fromstring(text, etree.HTMLParser())
-    result = (
-        str(
-            html.xpath(
-                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
-            )
-        )
-        .strip(" ['']")
-        .replace("', '", ",")
-    )
-    return result
+def getRelease(fanza_Crawler):
+    result = fanza_Crawler.getFanzaString('発売日:')
+    if result == '----':
+        result = fanza_Crawler.getFanzaString('配信開始日:')
+    return result.replace("/", "-").strip('\\n')
 
 
-def getStudio(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getRuntime(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
-    return re.search(r"\d+", str(result)).group()
-
-
-def getLabel(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getNum(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getYear(getRelease):
-    try:
-        result = str(re.search(r"\d{4}", getRelease).group())
-        return result
-    except:
-        return getRelease
-
-
-def getRelease(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
-        )[0].lstrip("\n")
-    except:
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
-            )[0].lstrip("\n")
-        except:
-            result = "----"
-    if result == "----":
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
-            )[0].lstrip("\n")
-        except:
-            try:
-                result = html.xpath(
-                    "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
-                )[0].lstrip("\n")
-            except:
-                pass
-    return result.replace("/", "-")
-
-
-def getTag(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
-        )
-        return result
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
-        )
-        return result
-
-
-def getCover(text, number):
-    html = etree.fromstring(text, etree.HTMLParser())
+def getCover(html, number):
     cover_number = number
     try:
         result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]

@@ -151,29 +54,11 @@ def getCover(text, number):
     return result
 
 
-def getDirector(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getOutline(text):
-    html = etree.fromstring(text, etree.HTMLParser())
+def getOutline(html):
     try:
-        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
-            "\n", ""
-        )
+        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
         if result == "":
-            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
-                "\n", ""
-            )
+            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
     except:
         # (TODO) handle more edge case
         # print(html)

@@ -181,23 +66,8 @@ def getOutline(text):
     return result
 
 
-def getSeries(text):
-    try:
-        html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
-            )[0]
-        except:
-            result = html.xpath(
-                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
-            )[0]
-        return result
-    except:
-        return ""
-
 def getExtrafanart(htmlcode):  # get stills
-    html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
+    html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div></div>')
     html = html_pather.search(htmlcode)
     if html:
         html = html.group()

@@ -232,6 +102,7 @@ def main(number):
         "https://www.dmm.co.jp/rental/-/detail/=/cid=",
     ]
     chosen_url = ""
+    fanza_Crawler = ''
 
     for url in fanza_urls:
         chosen_url = url + fanza_search_number

@@ -240,6 +111,7 @@ def main(number):
                 urlencode({"rurl": chosen_url})
             )
         )
+        fanza_Crawler = fanzaCrawler(htmlcode)
         if "404 Not Found" not in htmlcode:
             break
     if "404 Not Found" in htmlcode:

@@ -249,28 +121,34 @@ def main(number):
         # for example, the url will be cid=test012
         # but the hinban on the page is test00012
        # so get the hinban first, and then pass it to following functions
-        fanza_hinban = getNum(htmlcode)
+        fanza_hinban = fanza_Crawler.getFanzaString('品番:')
+        out_num = fanza_hinban
+        number_lo = number.lower()
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
+        if (re.sub('-|_', '', number_lo) == fanza_hinban or
+            number_lo.replace('-', '00') == fanza_hinban or
+            number_lo.replace('-', '') + 'so' == fanza_hinban
+        ):
+            out_num = number
         data = {
-            "title": getTitle(htmlcode).strip(),
-            "studio": getStudio(htmlcode),
-            "outline": getOutline(htmlcode),
-            "runtime": getRuntime(htmlcode),
-            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
-            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
-            "release": getRelease(htmlcode),
-            "number": fanza_hinban,
-            "cover": getCover(htmlcode, fanza_hinban),
+            "title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
+            "studio": fanza_Crawler.getFanzaString('メーカー'),
+            "outline": getOutline(html),
+            "runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
+            "director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
+            "actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
+            "release": getRelease(fanza_Crawler),
+            "number": out_num,
+            "cover": getCover(html, fanza_hinban),
             "imagecut": 1,
-            "tag": getTag(htmlcode),
+            "tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
             "extrafanart": getExtrafanart(htmlcode),
-            "label": getLabel(htmlcode),
-            "year": getYear(
-                getRelease(htmlcode)
-            ),  # str(re.search('\d{4}',getRelease(a)).group()),
+            "label": fanza_Crawler.getFanzaString('レーベル'),
+            "year": re.findall('\d{4}',getRelease(fanza_Crawler))[0],  # str(re.search('\d{4}',getRelease(a)).group()),
             "actor_photo": "",
             "website": chosen_url,
             "source": "fanza.py",
-            "series": getSeries(htmlcode),
+            "series": fanza_Crawler.getFanzaString('シリーズ:'),
         }
     except:
         data = {

@@ -314,4 +192,6 @@ def main_htmlcode(number):
 if __name__ == "__main__":
     # print(main("DV-1562"))
     # print(main("96fad1217"))
-    print(main("h_173ghmt68"))
+    print(main("pred00251"))
+    print(main("MIAA-391"))
+    print(main("OBA-326"))
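Editor's note: the getFanzaString helper above encodes one recurring DMM page pattern -- in the detail table, a field's value sits either inside a link or as bare cell text, so both XPaths are queried and concatenated. A standalone sketch of that lookup against made-up HTML:

    from lxml import etree

    TABLE = '''
    <table>
      <tr><td>メーカー</td><td><a href="#">Example Maker</a></td></tr>
      <tr><td>収録時間</td><td>120分</td></tr>
    </table>
    '''

    def table_value(html, label):
        linked = html.xpath(f"//td[contains(text(),'{label}')]/following-sibling::td/a/text()")
        plain = html.xpath(f"//td[contains(text(),'{label}')]/following-sibling::td/text()")
        return "".join(linked) + "".join(plain)

    html = etree.HTML(TABLE)
    print(table_value(html, 'メーカー'))   # Example Maker
    print(table_value(html, '収録時間'))   # 120分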
WebCrawler/fc2.py

@@ -4,58 +4,11 @@ import re
 from lxml import etree#need install
 import json
 import ADC_function
+from WebCrawler.crawler import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
-def getTitle_fc2com(htmlcode):  # get the title
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
-    return result
-def getActor_fc2com(htmlcode):
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
-        return result
-    except:
-        return ''
-def getStudio_fc2com(htmlcode):  # get the maker
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getNum_fc2com(htmlcode):  # get the ID number
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getRelease_fc2com(htmlcode2):
-    html=etree.fromstring(htmlcode2,etree.HTMLParser())
-    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
-    return result
-def getCover_fc2com(htmlcode2):  # get the cover
-    html = etree.fromstring(htmlcode2, etree.HTMLParser())
-    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
-    return 'http:' + result
-# def getOutline_fc2com(htmlcode2):  # get the outline
-#     xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
-#     path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
-#     html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
-#     print('https://adult.contents.fc2.com'+path)
-#     print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
-#     result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
-#     return result
-def getTag_fc2com(lx):
-    result = lx.xpath("//a[@class='tag tagTag']/text()")
-    return result
-def getYear_fc2com(release):
-    try:
-        result = re.search('\d{4}',release).group()
-        return result
-    except:
-        return ''
 
 def getExtrafanart(htmlcode):  # get stills
     html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
     html = html_pather.search(htmlcode)

@@ -79,27 +32,30 @@ def getTrailer(htmlcode, number):
         except:
             return ''
     else:
-        video_url = ''
+        return ''
 
 def main(number):
     try:
         number = number.replace('FC2-', '').replace('fc2-', '')
-        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
-        actor = getActor_fc2com(htmlcode2)
-        if not actor:
+        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
+        fc2_crawler = Crawler(htmlcode2)
+        actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
+        if actor == "":
             actor = '素人'
         lx = etree.fromstring(htmlcode2, etree.HTMLParser())
-        cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
+        cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
         cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
+        release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
+            strip(" ['販売日 : ']").replace('/','-')
         dic = {
-            'title': lx.xpath('/html/head/title/text()')[0],
-            'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+            'title': fc2_crawler.getString('/html/head/title/text()'),
+            'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
+            'year': re.findall('\d{4}',release)[0],
             'outline': '',  # getOutline_fc2com(htmlcode2),
             'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
-            'director': getStudio_fc2com(htmlcode2),
+            'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
             'actor': actor,
-            'release': getRelease_fc2com(htmlcode2),
+            'release': release,
             'number': 'FC2-' + number,
             'label': '',
             'cover': cover,

@@ -107,7 +63,7 @@ def main(number):
             'extrafanart': getExtrafanart(htmlcode2),
             "trailer": getTrailer(htmlcode2, number),
             'imagecut': 0,
-            'tag': getTag_fc2com(lx),
+            'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
             'actor_photo': '',
             'website': 'https://adult.contents.fc2.com/article/' + number + '/',
             'source': 'https://adult.contents.fc2.com/article/' + number + '/',

@@ -121,6 +77,4 @@ def main(number):
     return js
 
 if __name__ == '__main__':
-    print(main('FC2-1787685'))
-    print(main('FC2-2086710'))
-
+    print(main('FC2-2182382'))
WebCrawler/gcolle.py (new file, 88 lines)

@@ -0,0 +1,88 @@
+import sys
+sys.path.append('../')
+
+from WebCrawler.crawler import *
+from ADC_function import *
+from lxml import etree
+
+
+def main(number):
+    save_cookies = False
+    cookie_filename = 'gcolle.json'
+    try:
+        gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename)
+        session = get_html_session(cookies=gcolle_cooikes)
+        number = number.upper().replace('GCOLLE-','')
+
+        htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
+        gcolle_crawler = Crawler(htmlcode)
+        r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
+        if r18_continue and r18_continue.startswith('http'):
+            htmlcode = session.get(r18_continue).text
+            gcolle_crawler = Crawler(htmlcode)
+            save_cookies = True
+            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
+
+        number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
+        if number != number_html:
+            raise Exception('[-]gcolle.py: number not match')
+
+        if save_cookies:
+            cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
+            cookies_save.parent.mkdir(parents=True, exist_ok=True)
+            cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+
+        # get extrafanart url
+        if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
+            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
+        else:
+            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
+        # Add "https:" in each extrafanart url
+        for i in range(len(extrafanart)):
+            extrafanart[i] = 'https:' + extrafanart[i]
+
+        dic = {
+            "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
+            "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
+            "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
+            "runtime": '',
+            "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
+            "number": "GCOLLE-" + str(number_html),
+            "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
+            "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
+            "trailer": '',
+            "actor_photo": '',
+            "imagecut": 4,  # a value of 4 also marks the film as censored; face detection is still used to crop the cover
+            "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
+            "extrafanart": extrafanart,
+            "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "website": 'https://gcolle.net/product_info.php/products_id/' + number,
+            "source": 'gcolle.py',
+            "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            '无码': False,
+        }
+        # for k,v in dic.items():
+        #     if k == 'outline':
+        #         print(k,len(v))
+        #     else:
+        #         print(k,v)
+        # print('===============================================================')
+    except Exception as e:
+        dic = {'title':''}
+        if config.getInstance().debug():
+            print(e)
+
+    return dic
+
+if __name__ == '__main__':
+    from pprint import pprint
+    config.getInstance().set_override("debug_mode:switch=1")
+    pprint(main('840724'))
+    pprint(main('840386'))
+    pprint(main('838671'))
+    pprint(main('814179'))
+    pprint(main('834255'))
+    pprint(main('814179'))
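Editor's note: the cookie handling above is what lets gcolle.py pass the site's R18 confirmation only once. A reduced sketch of the same round-trip with plain requests (load_cookies and get_html_session are project helpers, so stand-ins are used; the file path matches the one in the code):

    import json
    from pathlib import Path
    import requests

    COOKIE_FILE = Path.home() / '.local/share/mdc/gcolle.json'

    def load_saved_cookies() -> dict:
        try:
            return json.loads(COOKIE_FILE.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            return {}

    def save_session_cookies(session: requests.Session) -> None:
        COOKIE_FILE.parent.mkdir(parents=True, exist_ok=True)
        COOKIE_FILE.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4),
                               encoding='utf-8')

    session = requests.Session()
    session.cookies.update(load_saved_cookies())
    # ...after following the R18 confirmation link once, persist for the next run:
    save_session_cookies(session)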
@@ -56,9 +56,9 @@ def parse_info(soup: BeautifulSoup) -> dict:
             "label": get_label(data_dic),
             "studio": get_studio(data_dic),
             "tag": get_tag(data_dic),
-            "number": get_number(data_dic),
+            "number": get_number(data_dic).upper(),
             "release": get_release(data_dic),
-            "runtime": get_runtime(data_dic),
+            "runtime": get_runtime(data_dic).replace(" minutes", ""),
             "series": get_series(data_dic),
         }
     else:
WebCrawler/javbus.py

@@ -60,10 +60,10 @@ def getCID(html):
     string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
     result = re.sub('/.*?.jpg','',string)
     return result
-def getOutline(number, title):  # fetch the storyline, queried from several sites concurrently
+def getOutline(number, title, uncensored):  # fetch the storyline, queried from several sites concurrently
     if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
         return ''  # calls coming from airav.py skip the outline and return right away, to avoid fetching the same data twice and slowing things down
-    return getStoryline(number,title)
+    return getStoryline(number,title, 无码=uncensored)
 def getSeriseJa(html):
     x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
     return str(x[0]) if len(x) else ''

@@ -83,9 +83,13 @@ def getExtrafanart(htmlcode):  # get stills
     if extrafanart_imgs:
         return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
     return ''
+def getUncensored(html):
+    x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
+    return bool(x)
+
 def main_uncensored(number):
-    htmlcode = get_html('https://www.javbus.com/ja/' + number)
+    w_number = number.replace('.', '-')
+    htmlcode = get_html('https://www.javbus.red/' + w_number)
     if "<title>404 Page Not Found" in htmlcode:
         raise Exception('404 page not found')
     lx = etree.fromstring(htmlcode, etree.HTMLParser())

@@ -94,7 +98,7 @@ def main_uncensored(number):
         'title': title,
         'studio': getStudioJa(lx),
         'year': getYear(lx),
-        'outline': getOutline(number, title),
+        'outline': getOutline(w_number, title, True),
         'runtime': getRuntime(lx),
         'director': getDirectorJa(lx),
         'actor': getActor(lx),

@@ -106,9 +110,10 @@ def main_uncensored(number):
         'label': getSeriseJa(lx),
         'imagecut': 0,
         # 'actor_photo': '',
-        'website': 'https://www.javbus.com/ja/' + number,
+        'website': 'https://www.javbus.red/' + w_number,
         'source': 'javbus.py',
         'series': getSeriseJa(lx),
+        '无码': True
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
     return js

@@ -136,7 +141,7 @@ def main(number):
         'title': title,
         'studio': getStudio(lx),
         'year': getYear(lx),
-        'outline': getOutline(number, title),
+        'outline': getOutline(number, title, getUncensored(lx)),
         'runtime': getRuntime(lx),
         'director': getDirector(lx),
         'actor': getActor(lx),

@@ -151,6 +156,7 @@ def main(number):
         'website': 'https://www.javbus.com/' + number,
         'source': 'javbus.py',
         'series': getSerise(lx),
+        '无码': getUncensored(lx)
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )  # .encode('UTF-8')
     return js

@@ -168,13 +174,14 @@ def main(number):
     return js
 
 if __name__ == "__main__" :
-    config.G_conf_override['debug_mode:switch'] = True
-    print(main('ABP-888'))
-    print(main('ABP-960'))
-    print(main('ADV-R0624'))  # 404
-    print(main('MMNT-010'))
-    print(main('ipx-292'))
-    print(main('CEMD-011'))
-    print(main('CJOD-278'))
+    config.getInstance().set_override("debug_mode:switch=1")
+    # print(main('ABP-888'))
+    # print(main('ABP-960'))
+    # print(main('ADV-R0624'))  # 404
+    # print(main('MMNT-010'))
+    # print(main('ipx-292'))
+    # print(main('CEMD-011'))
+    # print(main('CJOD-278'))
+    print(main('BrazzersExxtra.21.02.01'))
+    print(main('100221_001'))
+    print(main('AVSW-061'))
WebCrawler/javdb.py

@@ -166,12 +166,23 @@ def getDirector(html):
     result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
-def getOutline(number, title):  # fetch the storyline, queried from several sites concurrently
-    return getStoryline(number,title)
+def getOutline(number, title, uncensored):  # fetch the storyline, queried from several sites concurrently
+    return getStoryline(number, title, 无码=uncensored)
 def getSeries(html):
     result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
+def getUserRating(html):
+    try:
+        result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
+        v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
+        return float(v[0][0]), int(v[0][1])
+    except:
+        return
+def getUncensored(html):
+    x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
+                   ' or contains(@href,"/tags/western?")]')
+    return bool(x)
 
 def main(number):
     # after javdb's update only one of its numbered mirror sites can be logged in at a time -- the newest login kicks out the old one -- so the site is picked from the first javdb*.json cookie file found,

@@ -276,7 +287,7 @@ def main(number):
         'actor': getActor(lx),
         'title': title,
         'studio': getStudio(detail_page, lx),
-        'outline': getOutline(number, title),
+        'outline': getOutline(number, title, getUncensored(lx)),
         'runtime': getRuntime(lx),
         'director': getDirector(lx),
         'release': getRelease(detail_page),

@@ -293,8 +304,12 @@ def main(number):
         'website': urljoin('https://javdb.com', correct_url),
         'source': 'javdb.py',
         'series': getSeries(lx),
+        '无码': getUncensored(lx)
     }
+    userrating = getUserRating(lx)
+    if isinstance(userrating, tuple) and len(userrating) == 2:
+        dic['用户评分'] = userrating[0]
+        dic['评分人数'] = userrating[1]
     if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
         dic['actor'].append('素人')
     if not dic['series']:

@@ -313,18 +328,19 @@ def main(number):
 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
 if __name__ == "__main__":
-    config.G_conf_override['debug_mode:switch'] = True
+    config.getInstance().set_override("debug_mode:switch=1")
     # print(main('blacked.20.05.30'))
     # print(main('AGAV-042'))
     # print(main('BANK-022'))
-    # print(main('070116-197'))
+    print(main('070116-197'))
     # print(main('093021_539'))  # no stills; studio pacopacomama
     #print(main('FC2-2278260'))
     # print(main('FC2-735670'))
     # print(main('FC2-1174949'))  # not found
     #print(main('MVSD-439'))
     # print(main('EHM0001'))  # not found
-    print(main('FC2-2314275'))
+    #print(main('FC2-2314275'))
     # print(main('EBOD-646'))
     # print(main('LOVE-262'))
-    #print(main('ABP-890'))
+    print(main('ABP-890'))
+    print(main('blacked.14.12.08'))
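Editor's note: the rating text getUserRating parses looks like "4.5分, 由123人評價" ("4.5 points, rated by 123 users"). A quick check that the new regex captures both numbers (sample string made up to match that shape):

    import re

    sample = '4.5分, 由123人評價'
    v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', sample)
    print(v)                              # [('4.5', '123')]
    print(float(v[0][0]), int(v[0][1]))   # 4.5 123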
WebCrawler/madou.py

@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 from bs4 import BeautifulSoup  # need install
 from lxml import etree  # need install
 from pyquery import PyQuery as pq  # need install

@@ -5,24 +7,22 @@ from ADC_function import *
 import json
 import re
-from lib2to3.pgen2 import parse
-import sys
 
 from urllib.parse import urlparse, unquote
-sys.path.append('../')
 
 
 def getActorPhoto(html):
     return ''
 
 
-def getTitle(html, number):  # get the title
-    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
-    try:
-        result = str(re.split(r'[/|/|-]', title)[1])
-        return result.strip()
-    except:
-        return title.replace(number.upper(), '').strip()
-
+def getTitle(html):  # get the title
+    # <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
+    # <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
+    # <title>MD0094/贫嘴贱舌中出大嫂/坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
+    # <title>TM0002-我的痴女女友-麻豆社</title>
+    browser_title = str(html.xpath("/html/head/title/text()")[0])
+    title = str(re.findall(r'^[A-Z0-9 //\-]*(.*)-麻豆社$', browser_title)[0]).strip()
+    return title
 
 def getStudio(html):  # get the studio (updated)
     try:

@@ -61,7 +61,6 @@ def getNum(url, number):  # get the ID number
     filename = unquote(urlparse(url).path)
     # trim the filename
     result = filename[1:-5].upper().strip()
-    print(result)
     # strip the Chinese part
     if result.upper() != number.upper():
         result = re.split(r'[^\x00-\x7F]+', result, 1)[0]

@@ -83,13 +82,15 @@ def getSerise(html):  # get the series (updated)
     return ''
 
 
-def getTag(html):  # get tags
-    return html.xpath('//div[@class="article-tags"]/a/text()')
+def getTag(html, studio):  # get tags
+    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+    return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
 
 
 def getExtrafanart(html):  # get stills
     return ''
 
 
 def cutTags(tags):
     actors = []
     tags = []

@@ -109,13 +110,15 @@ def main(number):
 
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     url = getUrl(html)
-    tags = getTag(html)
-    actor,tags = cutTags(tags);
+    studio = getStudio(html)
+    tags = getTag(html, studio)
+    #actor,tags = cutTags(tags)  # the actor's position within the tags is not fixed, so extraction was abandoned
+    actor = ''
     dic = {
         # title
-        'title': getTitle(html, number),
+        'title': getTitle(html),
         # studio
-        'studio': getStudio(html),
+        'studio': studio,
         # year
         'year': getYear(html),
         # outline

@@ -143,7 +146,8 @@ def main(number):
         'website': url,
         'source': 'madou.py',
         # use
-        'series': getSerise(html)
+        'series': getSerise(html),
+        '无码': True
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
                     indent=4, separators=(',', ':'), )  # .encode('UTF-8')

@@ -161,4 +165,11 @@ def main(number):
 
 
 if __name__ == '__main__':
-    print(main('MD0094'))
+    config.getInstance().set_override("debug_mode:switch=1")
+    print(main('MD0129'))
+    # print(main('TM0002'))
+    # print(main('MD0222'))
+    # print(main('MD0140-2'))
+    # print(main('MAD039'))
+    # print(main('JDMY027'))
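Editor's note: the rewritten getTitle drops the leading product code and the trailing "-麻豆社" site suffix from the browser title. Checked against the four sample titles quoted in the comments above:

    import re

    samples = [
        'MD0140-2 / 家有性事EP2 爱在身边-麻豆社',
        'MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社',
        'MD0094/贫嘴贱舌中出大嫂/坏嫂嫂和小叔偷腥内射受孕-麻豆社',
        'TM0002-我的痴女女友-麻豆社',
    ]
    for s in samples:
        print(re.findall(r'^[A-Z0-9 //\-]*(.*)-麻豆社$', s)[0].strip())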
@@ -5,95 +5,28 @@ from lxml import etree
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from ADC_function import *
|
||||
from WebCrawler.crawler import *
|
||||
# import sys
|
||||
# import io
|
||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
||||
|
||||
def getTitle(a):
|
||||
try:
|
||||
html = etree.fromstring(a, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
|
||||
return result.replace('/', ',')
|
||||
except:
|
||||
return ''
|
||||
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
|
||||
def getStudio(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getRuntime(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1 + result2).strip('+').rstrip('mi')
|
||||
def getLabel(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getNum(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+')
|
||||
def getYear(getRelease):
|
||||
try:
|
||||
result = str(re.search('\d{4}',getRelease).group())
|
||||
return result
|
||||
except:
|
||||
return getRelease
|
||||
def getRelease(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace('/','-')
|
||||
class MgsCrawler(Crawler):
|
||||
def getMgsString(self, _xpath):
|
||||
html = self.html
|
||||
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
||||
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
|
||||
def getTag(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
||||
return result
|
||||
def getCover(htmlcode):
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="EnlargeImage"]/@href')).strip(" ['']")
|
||||
# result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
|
||||
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
|
||||
return result
|
||||
def getDirector(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getOutline(htmlcode):
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
||||
return result
|
||||
def getSeries(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||
|
||||
def getExtrafanart(htmlcode): # 获取剧照
|
||||
def getExtrafanart(htmlcode2): # 获取剧照
|
||||
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
|
||||
html = html_pather.search(htmlcode)
|
||||
html = html_pather.search(htmlcode2)
|
||||
if html:
|
||||
html = html.group()
|
||||
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
|
||||
@@ -104,36 +37,35 @@ def getExtrafanart(htmlcode): # 获取剧照

def main(number2):
    number = number2.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
    htmlcode2 = str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
    soup = BeautifulSoup(htmlcode2, 'lxml')
    a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
    b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
    htmlcode = MgsCrawler(htmlcode2)
    a = MgsCrawler(a2)
    b = MgsCrawler(b2)
    # print(b)
    try:
        dic = {
            'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
            'studio': getStudio(a),
            'outline': getOutline(b),
            'runtime': getRuntime(a),
            'director': getDirector(a),
            'actor': getActor(a),
            'release': getRelease(a),
            'number': getNum(a),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(a),
            'label': getLabel(a),
            'extrafanart': getExtrafanart(htmlcode),
            'year': getYear(getRelease(a)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': '',
            'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
            'source': 'mgstage.py',
            'series': getSeries(a),
        }
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        dic = {
            'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n", '').replace(' ', '').strip(),
            'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
            'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
            'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
            'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
            'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
            'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
            'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
            'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
            'imagecut': 1,
            'tag': getTag(a2),
            'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
            'extrafanart': getExtrafanart(htmlcode2),
            'year': str(re.findall('\d{4}', a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
            # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': '',
            'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
            'source': 'mgstage.py',
            'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
        }

    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
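WebCrawler/crawler.py is not part of this excerpt, so the MgsCrawler used above is opaque here. A minimal sketch consistent with the getString()/getMgsString() call sites — method bodies are assumptions, not the committed implementation:

from lxml import etree

class MgsCrawler:
    def __init__(self, htmlcode):
        # parse once, query many times
        self.html = etree.fromstring(htmlcode, etree.HTMLParser())

    def getString(self, xpath):
        # raw stringified xpath result, as the title/cover/outline call sites expect
        return str(self.html.xpath(xpath)).strip(" ['']")

    def getMgsString(self, xpath):
        # single detail-table cell with the newline cleanup the detail-row callers rely on
        return str(self.html.xpath(xpath)).strip(" ['']").strip('\\n ').strip('\\n')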
@@ -5,7 +5,6 @@ import json
import builtins
from ADC_function import *
from lxml.html import fromstring
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
@@ -13,7 +12,7 @@ from number_parser import is_uncensored

G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}

G_mode_txt = ('顺序执行','线程池','进程池')
G_mode_txt = ('顺序执行','线程池')

class noThread(object):
    def map(self, fn, param):
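The noThread body is cut off by the next hunk. A minimal sequential stand-in consistent with how it is used below (`with ... noThread() as pool: pool.map(...)`) would need map() plus the context-manager methods — a sketch under that assumption, not the committed code:

class noThread(object):
    def map(self, fn, param):
        return list(builtins.map(fn, param))  # run jobs one by one; same result shape as Pool.map

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # don't swallow exceptions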
@@ -25,14 +24,15 @@ class noThread(object):


# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
# (fetch the storyline from the listed sites concurrently; earlier entries take priority)
def getStoryline(number, title, sites: list=None):
def getStoryline(number, title, sites: list=None, 无码=None):
    start_time = time.time()
    conf = config.getInstance()
    if not conf.is_storyline():
        return ''
    debug = conf.debug() or conf.storyline_show() == 2
    storyine_sites = conf.storyline_site().split(',') if sites is None else sites
    if is_uncensored(number):
    unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
    if unc:
        storyine_sites += conf.storyline_uncensored_site().split(',')
    else:
        storyine_sites += conf.storyline_censored_site().split(',')
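The new 无码 ("uncensored") keyword lets a caller override the is_uncensored() heuristic instead of always deriving it from the number. Usage sketch with a hypothetical number:

getStoryline('ABC-123', 'some title')              # 无码 omitted: falls back to is_uncensored('ABC-123')
getStoryline('ABC-123', 'some title', 无码=False)  # force the censored site list regardless of the number
getStoryline('ABC-123', 'some title', 无码=True)   # force the uncensored site list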
@@ -49,9 +49,8 @@ def getStoryline(number, title, sites: list=None):
    cores = min(len(apply_sites), os.cpu_count())
    if cores == 0:
        return ''
    run_mode = conf.storyline_mode()
    assert run_mode in (0,1,2)
    with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
    run_mode = 1 if conf.storyline_mode() > 0 else 0
    with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
        results = pool.map(getStoryline_mp, mp_args)
    sel = ''
    if not debug and conf.storyline_show() == 0:
@@ -62,7 +61,7 @@ def getStoryline(number, title, sites: list=None):
            if not len(sel):
                sel = value
        return sel
    # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示
    # 以下debug结果输出会写入日志 (the debug output below is written to the log)
    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
    sel_site = ''
    for site, desc in zip(apply_sites, results):
@@ -80,34 +79,33 @@ def getStoryline(number, title, sites: list=None):


def getStoryline_mp(args):
    def _inner(site, number, title, debug):
        start_time = time.time()
        storyline = None
        if not isinstance(site, str):
            return storyline
        elif site == "airavwiki":
            storyline = getStoryline_airavwiki(number, debug)
        elif site == "airav":
            storyline = getStoryline_airav(number, debug)
        elif site == "avno1":
            storyline = getStoryline_avno1(number, debug)
        elif site == "xcity":
            storyline = getStoryline_xcity(number, debug)
        elif site == "amazon":
            storyline = getStoryline_amazon(title, number, debug)
        elif site == "58avgo":
            storyline = getStoryline_58avgo(number, debug)
        if not debug:
            return storyline
        # 进程池模式的子进程getStoryline_*()的print()不会写入日志中,线程池和顺序执行不受影响
        # (print() from getStoryline_*() in process-pool children is not logged; thread-pool and sequential runs are unaffected)
        print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
            site,
            time.time() - start_time,
            time.strftime("%H:%M:%S"),
            storyline if isinstance(storyline, str) and len(storyline) else '[空]')
        )
    (site, number, title, debug) = args
    start_time = time.time()
    storyline = None
    if not isinstance(site, str):
        return storyline
    return _inner(*args)
    elif site == "airavwiki":
        storyline = getStoryline_airavwiki(number, debug)
        #storyline = getStoryline_airavwiki_super(number, debug)
    elif site == "airav":
        storyline = getStoryline_airav(number, debug)
    elif site == "avno1":
        storyline = getStoryline_avno1(number, debug)
    elif site == "xcity":
        storyline = getStoryline_xcity(number, debug)
    elif site == "amazon":
        storyline = getStoryline_amazon(title, number, debug)
    elif site == "58avgo":
        storyline = getStoryline_58avgo(number, debug)
    if not debug:
        return storyline
    print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
        site,
        time.time() - start_time,
        time.strftime("%H:%M:%S"),
        storyline if isinstance(storyline, str) and len(storyline) else '[空]')
    )
    return storyline

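The site dispatch above could equally be written as a dict lookup instead of an elif chain — a sketch of that alternative, reusing the fetchers already defined in this module (the helper names here are hypothetical):

_SITE_FETCHERS = {
    "airavwiki": lambda number, title, debug: getStoryline_airavwiki(number, debug),
    "airav":     lambda number, title, debug: getStoryline_airav(number, debug),
    "avno1":     lambda number, title, debug: getStoryline_avno1(number, debug),
    "xcity":     lambda number, title, debug: getStoryline_xcity(number, debug),
    "amazon":    lambda number, title, debug: getStoryline_amazon(title, number, debug),
    "58avgo":    lambda number, title, debug: getStoryline_58avgo(number, debug),
}

def _fetch_storyline(site, number, title, debug):
    # returns None for unregistered sites, mirroring the elif chain's behaviour
    fn = _SITE_FETCHERS.get(site) if isinstance(site, str) else None
    return fn(number, title, debug) if fn else None
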
def getStoryline_airav(number, debug):
@@ -308,8 +306,8 @@ def getStoryline_amazon(q_title, number, debug):
        res = session.get(urljoin(res.url, lks[0]))
        cookie = None
    lx = fromstring(res.text)
    titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
    urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
    titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
    urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
    if not len(urls) or len(urls) != len(titles):
        raise ValueError("titles not found")
    idx = amazon_select_one(titles, q_title, number, debug)
@@ -325,8 +323,9 @@ def getStoryline_amazon(q_title, number, debug):
        res = session.get(urljoin(res.url, lks[0]))
        cookie = None
    lx = fromstring(res.text)
    div = lx.xpath('//*[@id="productDescription"]')[0]
    ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
    p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
    p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
    ama_t = ' '.join(p1) + ' '.join(p2)
    ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
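One detail of the new concatenation: ' '.join(p1) + ' '.join(p2) inserts no separator between the two paragraphs. A variant that keeps a single space between all spans would be:

    ama_t = ' '.join(p1 + p2)  # join the spans of both paragraphs with one space throughout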

    if cookie is None:
@@ -406,10 +405,10 @@ def amazon_select_one(a_titles, q_title, number, debug):
    # debug 模式下记录识别准确率日志 (in debug mode, log match-accuracy details)
    if ratio < 0.9:
        # 相似度[0.5, 0.9)的淘汰结果单独记录日志 (rejected matches with similarity in [0.5, 0.9) are logged separately)
        (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
            f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
        with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
            hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
        return -1
    # 被采信的结果日志 (log for accepted results)
    (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
        f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
    with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
        hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
    return sel
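The logging rewrite above replaces the chained Path.open(...).write(...) — which leaves closing the handle to the garbage collector — with a with block that flushes and closes deterministically. The same pattern in isolation (path reused from the code above; the mkdir is an extra assumption in case the directory is missing):

from pathlib import Path

log = Path.home() / '.mlogs' / 'ratio.txt'
log.parent.mkdir(parents=True, exist_ok=True)  # assumption: ~/.mlogs may not exist yet
with log.open('a', encoding='utf-8') as hrt:
    hrt.write('example line\n')  # handle is closed when the block exits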

@@ -128,7 +128,7 @@ def getOutline(html, number, title):
    a = set(storyline_site) & {'airav', 'avno1'}  # 只要中文的简介文字 (only want Chinese outline text)
    if len(a):
        site = [n for n in storyline_site if n in a]
        g = getStoryline(number, title, site)
        g = getStoryline(number, title, site, 无码=False)
        if len(g):
            return g
    try: