剧情简介:新增无码元数据站点,配置文件改为通用、有码、无码三种站点分列

This commit is contained in:
lededev
2021-10-21 20:02:07 +08:00
parent 1f9bf6b4c2
commit 850679705e
7 changed files with 92 additions and 30 deletions

View File

@@ -60,14 +60,9 @@ def get_year(lx: html.HtmlElement) -> str:
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
storyline_site = config.getInstance().storyline_site().split(',') if len(g):
a = set(storyline_site) & {'airav', 'avno1'} return g
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
if len(g):
return g
return o return o
def get_release(lx: html.HtmlElement) -> str: def get_release(lx: html.HtmlElement) -> str:

View File

@@ -8,8 +8,9 @@ from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher from difflib import SequenceMatcher
from unicodedata import category from unicodedata import category
from number_parser import is_uncensored
G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"} G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池') G_mode_txt = ('顺序执行','线程池','进程池')
@@ -28,7 +29,16 @@ def getStoryline(number, title, sites: list=None):
conf = config.getInstance() conf = config.getInstance()
debug = conf.debug() or conf.storyline_show() == 2 debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites storyine_sites = conf.storyline_site().split(',') if sites is None else sites
apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] if is_uncensored(number):
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
r_dup = set()
apply_sites = []
for s in storyine_sites:
if s in G_registered_storyline_site and s not in r_dup:
apply_sites.append(s)
r_dup.add(s)
mp_args = ((site, number, title, debug) for site in apply_sites) mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count()) cores = min(len(apply_sites), os.cpu_count())
if cores == 0: if cores == 0:
@@ -80,6 +90,8 @@ def _getStoryline_mp(site, number, title, debug):
storyline = getStoryline_xcity(number, debug) storyline = getStoryline_xcity(number, debug)
elif site == "amazon": elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug) storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug: if not debug:
return storyline return storyline
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
@@ -119,24 +131,63 @@ def getStoryline_airav(number, debug):
return None return None
def getStoryline_58avgo(number, debug):
    """Fetch the storyline text for ``number`` from 58avgo.com.

    The site's search form is submitted, the first search result whose title
    contains ``number`` (case-insensitive) is followed to its detail page,
    the bracketed number in the detail page title is checked against
    ``number``, and the text of ``#ContentPlaceHolder1_Label2`` is returned.

    Args:
        number: Movie number to look up.
        debug: When truthy, print the reason a lookup failed.

    Returns:
        The stripped storyline text, or ``''`` when any step fails
        (this function is best-effort and never raises ``Exception``).
    """
    try:
        # Pick a random landing page so the site's httpd log does not show a
        # single repeated request pattern coming from one IP.
        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
            '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
            '?status=1&Sort=Playon', '?status=1&Sort=dateupload', '?status=1&Sort=dateproduce'
        ])
        # Numbers shaped like "6 digits, '-' or '_', 2-3 digits" are searched
        # by their first six digits only; anything else is searched verbatim.
        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
        result, browser = get_html_by_form(url,
            fields={'ctl00$TextBox_SearchKeyWord': kwd},
            return_type='browser')
        if not result.ok:
            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
        if f'searchresults.aspx?Search={kwd}' not in browser.url:
            raise ValueError("number not found")
        # Escape the number so characters such as '.' match literally instead
        # of being interpreted as regex metacharacters.
        number_re = re.compile(re.escape(number), re.I)
        candidates = browser.page.select(
            'div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
        link = next(
            (a for a in candidates if number_re.search(a.h3.text.strip())),
            None)
        if link is None:
            raise ValueError("number not found")
        result = browser.follow_link(link)
        if not result.ok or 'playon.aspx' not in browser.url:
            raise ValueError("detail page not found")
        title = browser.page.select('head > title')[0].text.strip()
        bracketed = re.search(r'\[(.*?)]', title)
        if bracketed is None:
            raise ValueError(f"detail page title has no [number], got ->[{title}]")
        detail_number = bracketed.group(1)
        if not number_re.search(detail_number):
            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
    except Exception as e:
        # Deliberate best-effort: any scraping failure yields an empty result.
        if debug:
            print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
    return ''
def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得
try: try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
]) # 随机选一个避免网站httpd日志中单个ip的请求太过单一 ]) # 随机选一个避免网站httpd日志中单个ip的请求太过单一
number_up = number.upper()
result, browser = get_html_by_form(url, result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form', form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number_up}, fields = {'kw' : number},
return_type = 'browser') return_type = 'browser')
if not result.ok: if not result.ok:
raise ValueError(f"get_html_by_form('{url}','{number_up}') failed") raise ValueError(f"get_html_by_form('{url}','{number}') failed")
title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() s = browser.page.select('div.type_movie > div > ul > li > div')
page_number = title[title.rfind(' '):].upper() for i in range(len(s)):
if not number_up in page_number: title = s[i].a.h3.text.strip()
raise ValueError(f"page number ->[{page_number}] not match") page_number = title[title.rfind(' '):].strip()
return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() if re.search(number, page_number, re.I):
return s[i]['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e: except Exception as e:
if debug: if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")

View File

@@ -122,7 +122,7 @@ def getDirector(html):
def getOutline(html, number, title): def getOutline(html, number, title):
storyline_site = config.getInstance().storyline_site().split(',') storyline_site = config.getInstance().storyline_site().split(',')
a = set(storyline_site) & {'airav', 'avno1'} a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字
if len(a): if len(a):
site = [n for n in storyline_site if n in a] site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site) g = getStoryline(number, title, site)

View File

@@ -86,11 +86,16 @@ extrafanart_folder=extrafanart
; 剧情简介 ; 剧情简介
[storyline] [storyline]
; website为javbusjavdbsite为获取剧情简介信息的可选数据源站点列表。列表内站点同时并发查询取值优先级 ; website为javbus javdb avsox xcity carib时site censored_site uncensored_site 为获取剧情简介信息的
; 从左到右,靠左站点没数据才会采用后面站点获得的。其中airav和avno1是中文剧情简介xcity和amazon是日语的 ; 可选数据源站点列表。列表内站点同时并发查询,取值优先级从左到右,靠左站点没数据才会采用后面站点获得的。
; 于amazon商城没有番号信息选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 ; 其中airav avno1 58avgo是中文剧情简介区别是airav只能查有码avno1有码无码都能查58avgo只能查无码或者
; 流出破解马赛克的影片(此功能没使用)。
; xcity和amazon是日语的由于amazon商城没有番号信息选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询,
; 设置成不查询可大幅提高刮削速度。
; site= ; site=
site=airav,avno1,xcity,amazon site=avno1
censored_site=airav,xcity,amazon
uncensored_site=58avgo
; 运行模式0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) ; 运行模式0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快)
run_mode=1 run_mode=1
; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志)剧情简介失效时可打开2查看原因 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志)剧情简介失效时可打开2查看原因

View File

@@ -243,7 +243,19 @@ class Config:
try: try:
return self.conf.get("storyline", "site") return self.conf.get("storyline", "site")
except: except:
return "airav,avno1,xcity,amazon" return "avno1"
def storyline_censored_site(self) -> str:
    """Return the ``censored_site`` option of the ``[storyline]`` section.

    Returns:
        The raw option string read through ``self.conf``, or the built-in
        default ``"airav,xcity,amazon"`` when the lookup fails.
    """
    try:
        return self.conf.get("storyline", "censored_site")
    except Exception:
        # Fall back to the default on any lookup failure, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        return "airav,xcity,amazon"
def storyline_uncensored_site(self) -> str:
    """Return the ``uncensored_site`` option of the ``[storyline]`` section.

    Returns:
        The raw option string read through ``self.conf``, or the built-in
        default ``"58avgo"`` when the lookup fails.
    """
    try:
        return self.conf.get("storyline", "uncensored_site")
    except Exception:
        # Fall back to the default on any lookup failure, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        return "58avgo"
def storyline_show(self) -> int: def storyline_show(self) -> int:
try: try:
@@ -354,7 +366,9 @@ class Config:
sec14 = "storyline" sec14 = "storyline"
conf.add_section(sec14) conf.add_section(sec14)
conf.set(sec14, "site", "airav,avno1,xcity,amazon") conf.set(sec14, "site", "avno1")
conf.set(sec14, "censored_site", "airav,xcity,amazon")
conf.set(sec14, "uncensored_site", "58avgo")
conf.set(sec14, "show_result", 0) conf.set(sec14, "show_result", 0)
conf.set(sec14, "run_mode", 1) conf.set(sec14, "run_mode", 1)

View File

@@ -566,10 +566,7 @@ def core_main(file_path, number_th):
c_word = '-C' # 中文字幕影片后缀 c_word = '-C' # 中文字幕影片后缀
# 判断是否无码 # 判断是否无码
if is_uncensored(number): uncensored = 1 if is_uncensored(number) else 0
uncensored = 1
else:
uncensored = 0
if '流出' in filepath or 'uncensored' in filepath: if '流出' in filepath or 'uncensored' in filepath:

View File

@@ -71,7 +71,7 @@ G_TAKE_NUM_RULES = {
'10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0])
} }
def get_number_by_dict(filename: str) -> str: def get_number_by_dict(filename: str) -> str: