Parallel query on storyline data

This commit is contained in:
lededev
2021-10-17 21:59:08 +08:00
parent b006aee34d
commit a546c4e83e
7 changed files with 336 additions and 53 deletions

View File

@@ -85,12 +85,11 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
 def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
     if isinstance(cookies, dict) and len(cookies):
         s = requests.Session()
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
-    else:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
     configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
@@ -109,12 +108,11 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
 def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
     if isinstance(cookies, dict) and len(cookies):
         s = requests.Session()
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
-    else:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
     configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
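
Both helpers now share one construction path: s stays None unless cookies are supplied, and mechanicalsoup creates its own requests.Session when session is None, so the old else branch was dead weight. A minimal sketch of the call side (URL and cookie values are hypothetical):

    result, browser = get_html_by_browser(
        'https://example.com/search',       # hypothetical URL
        cookies={'session_id': 'abc123'},   # optional; omit to start a fresh session
        return_type='browser')
    if result.ok:
        print(browser.page.title)           # browser.page is the parsed document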

View File

@@ -416,7 +416,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number):
         print('[!]', err)

-if __name__ == '__main__':
+def main():
     version = '5.0.1'
     urllib3.disable_warnings() #Ignore http proxy warning
@@ -483,6 +483,7 @@ if __name__ == '__main__':
     count = 0
     count_all = str(len(movie_list))
     print('[+]Find', count_all, 'movies.')
+    print('[*]======================================================')
     stop_count = conf.stop_counter()
     if stop_count<1:
         stop_count = 999999
@@ -517,3 +518,8 @@ if __name__ == '__main__':
     input("Press enter key exit, you can check the error message before you exit...")
     sys.exit(0)
+
+import multiprocessing
+if __name__ == '__main__':
+    multiprocessing.freeze_support()
+    main()
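
Hoisting the script body into main() and calling multiprocessing.freeze_support() is what makes the new process pool in WebCrawler/storyline.py safe: under spawn semantics (Windows, and frozen executables) every worker re-imports the entry module, so top-level code must be guarded. A minimal sketch of the pattern, with a hypothetical worker:

    from multiprocessing import Pool

    def square(x):  # hypothetical worker function
        return x * x

    if __name__ == '__main__':  # without the guard, spawned workers would rerun this block
        with Pool(2) as pool:
            print(pool.map(square, [1, 2, 3]))  # -> [1, 4, 9]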

View File

@@ -6,6 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler.storyline import getStoryline
 import inspect
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
@@ -91,33 +92,8 @@ def getOutline0(number): # fetch storyline; airav.wiki 404s, function kept for now
     except:
         pass
     return ''
-def getOutline(number): # fetch storyline from avno1.cc
-    try:
-        url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
-            secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
-            '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
-        ]) # pick a random entry page so one IP's requests don't look too uniform in the site's httpd logs
-        number_up = number.upper()
-        result, browser = get_html_by_form(url,
-                            form_select='div.wrapper > div.header > div.search > form',
-                            fields = {'kw' : number_up},
-                            return_type = 'browser')
-        if not result.ok:
-            raise
-        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
-        page_number = title[title.rfind(' '):].upper()
-        if not number_up in page_number:
-            raise
-        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
-    except:
-        pass
-    try:
-        from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
-        detail_html, browser = open_by_browser(number)
-        return xcity_getOutline(detail_html)
-    except:
-        pass
-    return ''
+def getOutline(number, title): # fetch storyline via concurrent multi-process query
+    return getStoryline(number, title)
 def getSerise(htmlcode): # get series (modified)
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     # if the record has no director, the series sits in 6th place
@@ -156,11 +132,12 @@ def main_uncensored(number):
     htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
     if "<title>404 Page Not Found" in htmlcode:
         raise Exception('404 page not found')
+    title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','')
     dic = {
-        'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
+        'title': title,
         'studio': getStudio(htmlcode),
         'year': getYear(htmlcode),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
@@ -189,11 +166,12 @@ def main(number):
     htmlcode = get_html('https://www.javbus.com/' + number)
     if "<title>404 Page Not Found" in htmlcode:
         raise Exception('404 page not found')
+    title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode)))
     dic = {
-        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
+        'title': title,
         'studio': getStudio(htmlcode),
         'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
@@ -225,7 +203,11 @@ def main(number):
     return js

 if __name__ == "__main__" :
-    #print(main('ADV-R0624')) # 404
+    config.G_conf_override['debug_mode:switch'] = True
+    print(main('ABP-888'))
+    print(main('ABP-960'))
+    # print(main('ADV-R0624')) # 404
+    # print(main('MMNT-010'))
     print(main('ipx-292'))
     print(main('CEMD-011'))
     print(main('CJOD-278'))

View File

@@ -1,13 +1,11 @@
 import sys
-from mechanicalsoup.stateful_browser import StatefulBrowser
 sys.path.append('../')
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 from ADC_function import *
-# import sys
+from mechanicalsoup.stateful_browser import StatefulBrowser
+from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -206,9 +204,8 @@ def getOutline0(number): # fetch storyline; airav.wiki 404s, function kept for now
     except:
         pass
     return ''
-def getOutline(number): # fetch storyline
-    from WebCrawler.javbus import getOutline as javbus_getOutline
-    return javbus_getOutline(number)
+def getOutline(number, title): # fetch storyline via concurrent multi-process query
+    return getStoryline(number, title)
 def getSeries(a):
     #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -309,7 +306,7 @@ def main(number):
         'actor': getActor(detail_page),
         'title': title,
         'studio': getStudio(detail_page),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(detail_page),
         'director': getDirector(detail_page),
         'release': getRelease(detail_page),
@@ -350,11 +347,13 @@ if __name__ == "__main__":
     # print(main('blacked.20.05.30'))
     # print(main('AGAV-042'))
     # print(main('BANK-022'))
-    print(main('070116-197'))
-    print(main('093021_539')) # no stills; studio pacopacomama
-    print(main('FC2-2278260'))
-    print(main('FC2-735670'))
+    # print(main('070116-197'))
+    # print(main('093021_539')) # no stills; studio pacopacomama
+    # print(main('FC2-2278260'))
+    # print(main('FC2-735670'))
     # print(main('FC2-1174949')) # not found
     print(main('MVSD-439'))
     # print(main('EHM0001')) # not found
-    print(main('FC2-2314275'))
+    # print(main('FC2-2314275'))
+    # print(main('EBOD-646'))
+    print(main('LOVE-262'))

WebCrawler/storyline.py Normal file (+270 lines)
View File

@@ -0,0 +1,270 @@
+import sys
+sys.path.append('../')
+import re
+import json
+from ADC_function import *  # also provides time, secrets, config, Path, etree used below
+from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
+from multiprocessing import Pool
+from difflib import SequenceMatcher
+from unicodedata import category
+
+G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"}
+
+# Fetch the storyline: all sites in the list are queried concurrently, and
+# results are preferred in front-to-back list order.
+def getStoryline(number, title):
+    start_time = time.time()
+    conf = config.getInstance()
+    debug = conf.debug() or conf.storyline_show() == 2
+    storyline_sites = conf.storyline_site().split(',')
+    apply_sites = [s for s in storyline_sites if s in G_registered_storyline_site]
+    mp_args = ((site, number, title, debug) for site in apply_sites)
+    # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/
+    with Pool() as proc_pool:
+        result = proc_pool.map(getStoryline_mp, mp_args)
+    if not debug and conf.storyline_show() == 0:
+        for value in result:
+            if isinstance(value, str) and len(value):
+                return value
+        return ''
+    # The debug output below is written to the log; output produced inside the
+    # process pool is not -- it only appears on stdout.
+    cnt = len(apply_sites)
+    s = f'[!]MP Storyline ran {cnt} processes in {time.time() - start_time:.3f}s total (including startup overhead), finished at {time.strftime("%H:%M:%S")}'
+    first = True
+    sel = ''
+    for i in range(cnt):
+        sl = len(result[i]) if isinstance(result[i], str) else 0
+        if sl and first:
+            s += f' [selected {apply_sites[i]}, {sl} chars]'
+            first = False
+            sel = result[i]
+        elif sl:
+            s += f' {apply_sites[i]}: {sl} chars'
+        else:
+            s += f' {apply_sites[i]}: empty'
+    print(s)
+    return sel
+def getStoryline_mp(args):
+    return _getStoryline_mp(*args)
+
+# Note: print() from a child process is not written to the log. To debug or fix
+# a dead data source later, watch stdout directly; for issue reports, attach a
+# screenshot of the screen.
+def _getStoryline_mp(site, number, title, debug):
+    start_time = time.time()
+    storyline = None
+    if not isinstance(site, str):
+        return storyline
+    elif site == "airav":
+        storyline = getStoryline_airav(number, debug)
+    elif site == "avno1":
+        storyline = getStoryline_avno1(number, debug)
+    elif site == "xcity":
+        storyline = getStoryline_xcity(number, debug)
+    elif site == "amazon":
+        storyline = getStoryline_amazon(title, number, debug)
+    if not debug:
+        return storyline
+    print("[!]MP process [{}] ran {:.3f}s, finished at {}, result: {}".format(
+        site,
+        time.time() - start_time,
+        time.strftime("%H:%M:%S"),
+        storyline if isinstance(storyline, str) and len(storyline) else '[empty]')
+    )
+    return storyline
+def getStoryline_airav(number, debug):
+    try:
+        number_up = number.upper()
+        site = secrets.choice(('airav.cc','airav4.club'))
+        url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
+        res, browser = get_html_by_browser(url, return_type='browser')
+        if not res.ok:
+            raise ValueError(f"get_html_by_browser('{url}') failed")
+        avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
+        if number_up not in avs.select_one('a > h3').text.upper():
+            raise ValueError("number not found")
+        detail_url = avs.select_one('a')['href']
+        res = browser.open_relative(detail_url)
+        if not res.ok:
+            raise ValueError(f"browser.open_relative('{detail_url}') failed")
+        t = browser.page.select_one('head > title').text
+        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
+        if number_up != airav_number:
+            raise ValueError(f"page number ->[{airav_number}] not match")
+        desc = browser.page.select_one('li.introduction > span').text.strip()
+        return desc
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_airav Error: {e}, number [{number}].")
+        pass
+    return None
+def getStoryline_avno1(number, debug): # fetch storyline from avno1.cc
+    try:
+        url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
+            secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
+            '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
+        ]) # pick a random entry page so one IP's requests don't look too uniform in the site's httpd logs
+        number_up = number.upper()
+        result, browser = get_html_by_form(url,
+                              form_select='div.wrapper > div.header > div.search > form',
+                              fields={'kw': number_up},
+                              return_type='browser')
+        if not result.ok:
+            raise ValueError(f"get_html_by_form('{url}','{number_up}') failed")
+        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
+        page_number = title[title.rfind(' '):].upper()
+        if number_up not in page_number:
+            raise ValueError(f"page number ->[{page_number}] not match")
+        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
+        pass
+    return ''
+def getStoryline_xcity(number, debug): # fetch storyline from xcity
+    try:
+        xcity_number = number.replace('-','')
+        query_result, browser = get_html_by_form(
+            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
+            fields={'q': xcity_number.lower()},
+            return_type='browser')
+        if not query_result or not query_result.ok:
+            raise ValueError("page not found")
+        result = browser.follow_link(browser.links(r'avod\/detail')[0])
+        if not result.ok:
+            raise ValueError("detail page not found")
+        return browser.page.select_one('h2.title-detail + p.lead').text.strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
+        pass
+    return ''
+def getStoryline_amazon(q_title, number, debug):
+    if not isinstance(q_title, str) or not len(q_title):
+        return None
+    try:
+        amazon_cookie, _ = load_cookies('amazon.json')
+        cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
+        url = "https://www.amazon.co.jp/s?k=" + q_title
+        res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
+        if not res.ok:
+            raise ValueError("get_html_by_browser() failed")
+        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+        if isinstance(lks, list) and len(lks): # step past Amazon's black-curtain (adult content) interstitial
+            browser.follow_link(lks[0])
+            cookie = None
+        html = etree.fromstring(str(browser.page), etree.HTMLParser())
+        titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
+        urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
+        if not len(urls) or len(urls) != len(titles):
+            raise ValueError("titles not found")
+        idx = amazon_select_one(titles, q_title, number, debug)
+        if not isinstance(idx, int) or idx < 0:
+            raise ValueError("title and number not found")
+        furl = urls[idx]
+        r = browser.open_relative(furl)
+        if not r.ok:
+            raise ValueError("browser.open_relative() failed.")
+        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+        if isinstance(lks, list) and len(lks):
+            browser.follow_link(lks[0])
+            cookie = None
+        ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
+        ama_t = re.sub(r'審査番号:\d+', '', ama_t)
+        if cookie is None:
+            # The auto-created cookies file sits at the end of the search path
+            # list, at the lowest priority. Users with an amazon.co.jp account
+            # can export cookies from their browser into an earlier search path.
+            ama_save = Path.home() / ".local/share/avdc/amazon.json"
+            ama_save.parent.mkdir(parents=True, exist_ok=True)
+            ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+        return ama_t
+    except Exception as e:
+        if debug:
+            print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
+        pass
+    return None
+# From the search results, pick the DVD/Blu-ray listing whose title is most
+# similar to the queried title.
+def amazon_select_one(a_titles, q_title, number, debug):
+    sel = -1
+    ratio = 0
+    # strip punctuation (P), symbols (S) and separators (Z) by Unicode category
+    que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
+    for loc in range(len(a_titles)):
+        t = a_titles[loc]
+        if re.search(number, t, re.I): # listings rarely carry the ID number, but a few do; an exact match passes outright
+            ratio = 1.0
+            sel = loc
+            save_t_ = t
+            break
+        if not re.search('DVD|Blu-ray', t, re.I):
+            continue
+        ama_t = str(re.sub('DVD|Blu-ray', "", t, flags=re.I))
+        ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
+        findlen = 0
+        lastpos = -1
+        cnt = len(ama_t)
+        for c in reversed(ama_t):
+            cnt -= 1
+            pos = que_t.rfind(c)
+            if lastpos >= 0:
+                pos_near = que_t[:lastpos].rfind(c)
+                if pos_near < 0:
+                    findlen = 0
+                    lastpos = -1
+                    ama_t = ama_t[:cnt+1]
+                else:
+                    pos = pos_near
+            if pos < 0:
+                if category(c) == 'Nd':
+                    return -1
+                ama_t = ama_t[:cnt]
+                findlen = 0
+                lastpos = -1
+                continue
+            if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
+                findlen += 1
+                lastpos = pos
+                if findlen >= 4:
+                    break
+                continue
+            findlen = 1
+            lastpos = pos
+        if findlen == 0:
+            return -1
+        r = SequenceMatcher(None, ama_t, que_t).ratio()
+        if r > ratio:
+            sel = loc
+            ratio = r
+            save_t_ = ama_t
+            if ratio > 0.999:
+                break
+    if ratio < 0.5:
+        return -1
+    if not debug:
+        # only results with similarity >= 0.9 are currently trusted
+        return sel if ratio >= 0.9 else -1
+    # in debug mode, log the recognition-accuracy data
+    if ratio < 0.9:
+        # rejected results with similarity in [0.5, 0.9) get their own log
+        (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
+            f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}')
+        return -1
+    # log the accepted result
+    (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
+        f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}')
+    return sel
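
A minimal sketch of driving the new module directly (number and title are placeholder inputs; getStoryline fans each configured site out to its own worker process and returns the first non-empty answer in configured order):

    from WebCrawler.storyline import getStoryline

    if __name__ == '__main__':  # required: getStoryline creates a process pool
        outline = getStoryline('ABP-888', 'some queried title')  # hypothetical inputs
        print(outline or '[no storyline found]')

For the amazon path, amazon_select_one leans on difflib.SequenceMatcher: for example, SequenceMatcher(None, 'ABCD', 'ABXD').ratio() == 0.75, so the 0.9 acceptance threshold tolerates only small differences between the cleaned titles.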

View File

@@ -7,7 +7,7 @@ soft_link=0
 failed_move=1
 auto_exit=0
 transalte_to_sc=0
-multi_threading=1
+multi_threading=0
 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
 actor_gender=female
 del_empty_folder=1
@@ -85,3 +85,12 @@ water=2
 switch=0
 extrafanart_folder=extrafanart

+; storyline
+[storyline]
+; When website is javbus or javdb, site is the list of optional data sources for
+; the storyline. The listed sites are queried concurrently; priority runs left to
+; right, and a later site's result is used only when the sites before it return
+; nothing. airav and avno1 give Chinese storylines; xcity and amazon give Japanese
+; ones. Since the amazon store carries no ID numbers, the matching DVD is picked
+; with only 99.6% accuracy. An empty list disables the query entirely, which
+; greatly speeds up scraping.
+; site=
+site=airav,avno1,xcity,amazon
+; show_result: storyline debug info. 0=off, 1=brief, 2=verbose (the verbose part
+; is not written to the log). Turn on 2 to see why a storyline source stopped working.
+show_result=0
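
A sketch of how these keys are consumed on the Python side (mirroring getStoryline above; conf is the already-initialized config singleton):

    conf = config.getInstance()
    sites = conf.storyline_site().split(',')  # -> ['airav', 'avno1', 'xcity', 'amazon']
    apply_sites = [s for s in sites if s in G_registered_storyline_site]
    verbose = conf.debug() or conf.storyline_show() == 2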

View File

@@ -240,6 +240,20 @@ class Config:
     def debug(self) -> bool:
         return self.getboolean_override("debug_mode", "switch")

+    def storyline_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "site")
+        except:
+            return "airav,avno1,xcity,amazon"
+
+    def storyline_show(self) -> int:
+        try:
+            v = self.conf.getint("storyline", "show_result")
+            return v if v in (0, 1, 2) else 2 if v > 2 else 0
+        except:
+            return 0
+
     @staticmethod
     def _exit(sec: str) -> None:
         print("[-] Read config error! Please check the {} section in config.ini", sec)
@@ -333,6 +347,11 @@ class Config:
         conf.set(sec13, "switch", 1)
         conf.set(sec13, "extrafanart_folder", "extrafanart")

+        sec14 = "storyline"
+        conf.add_section(sec14)
+        conf.set(sec14, "site", "airav,avno1,xcity,amazon")
+        conf.set(sec14, "show_result", 0)
+
         return conf