Parallel query on storyline data
This commit is contained in:
@@ -85,12 +85,11 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
|
|||||||
|
|
||||||
|
|
||||||
def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
|
def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
|
||||||
|
s = None
|
||||||
if isinstance(cookies, dict) and len(cookies):
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
|
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
|
||||||
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
|
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
|
||||||
else:
|
|
||||||
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
|
|
||||||
configProxy = config.getInstance().proxy()
|
configProxy = config.getInstance().proxy()
|
||||||
if configProxy.enable:
|
if configProxy.enable:
|
||||||
browser.session.proxies = configProxy.proxies()
|
browser.session.proxies = configProxy.proxies()
|
||||||
@@ -109,12 +108,11 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
|
|||||||
|
|
||||||
|
|
||||||
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
|
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
|
||||||
|
s = None
|
||||||
if isinstance(cookies, dict) and len(cookies):
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
|
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
|
||||||
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
|
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
|
||||||
else:
|
|
||||||
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
|
|
||||||
configProxy = config.getInstance().proxy()
|
configProxy = config.getInstance().proxy()
|
||||||
if configProxy.enable:
|
if configProxy.enable:
|
||||||
browser.session.proxies = configProxy.proxies()
|
browser.session.proxies = configProxy.proxies()
|
||||||
|
|||||||
@@ -416,7 +416,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number):
|
|||||||
print('[!]', err)
|
print('[!]', err)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
def main():
|
||||||
version = '5.0.1'
|
version = '5.0.1'
|
||||||
urllib3.disable_warnings() #Ignore http proxy warning
|
urllib3.disable_warnings() #Ignore http proxy warning
|
||||||
|
|
||||||
@@ -483,6 +483,7 @@ if __name__ == '__main__':
|
|||||||
count = 0
|
count = 0
|
||||||
count_all = str(len(movie_list))
|
count_all = str(len(movie_list))
|
||||||
print('[+]Find', count_all, 'movies.')
|
print('[+]Find', count_all, 'movies.')
|
||||||
|
print('[*]======================================================')
|
||||||
stop_count = conf.stop_counter()
|
stop_count = conf.stop_counter()
|
||||||
if stop_count<1:
|
if stop_count<1:
|
||||||
stop_count = 999999
|
stop_count = 999999
|
||||||
@@ -517,3 +518,8 @@ if __name__ == '__main__':
|
|||||||
input("Press enter key exit, you can check the error message before you exit...")
|
input("Press enter key exit, you can check the error message before you exit...")
|
||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
import multiprocessing
|
||||||
|
if __name__ == '__main__':
|
||||||
|
multiprocessing.freeze_support()
|
||||||
|
main()
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from lxml import etree#need install
|
|||||||
from bs4 import BeautifulSoup#need install
|
from bs4 import BeautifulSoup#need install
|
||||||
import json
|
import json
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
|
from WebCrawler.storyline import getStoryline
|
||||||
import inspect
|
import inspect
|
||||||
|
|
||||||
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
||||||
@@ -91,33 +92,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return ''
|
return ''
|
||||||
def getOutline(number): #获取剧情介绍 从avno1.cc取得
|
def getOutline(number, title): #获取剧情介绍 多进程并发查询
|
||||||
try:
|
return getStoryline(number,title)
|
||||||
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
|
|
||||||
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
|
|
||||||
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
|
|
||||||
]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一
|
|
||||||
number_up = number.upper()
|
|
||||||
result, browser = get_html_by_form(url,
|
|
||||||
form_select='div.wrapper > div.header > div.search > form',
|
|
||||||
fields = {'kw' : number_up},
|
|
||||||
return_type = 'browser')
|
|
||||||
if not result.ok:
|
|
||||||
raise
|
|
||||||
title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
|
|
||||||
page_number = title[title.rfind(' '):].upper()
|
|
||||||
if not number_up in page_number:
|
|
||||||
raise
|
|
||||||
return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
|
|
||||||
detail_html, browser = open_by_browser(number)
|
|
||||||
return xcity_getOutline(detail_html)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return ''
|
|
||||||
def getSerise(htmlcode): #获取系列 已修改
|
def getSerise(htmlcode): #获取系列 已修改
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||||
# 如果记录中冇导演,系列排在第6位
|
# 如果记录中冇导演,系列排在第6位
|
||||||
@@ -156,11 +132,12 @@ def main_uncensored(number):
|
|||||||
htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
|
htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
|
||||||
if "<title>404 Page Not Found" in htmlcode:
|
if "<title>404 Page Not Found" in htmlcode:
|
||||||
raise Exception('404 page not found')
|
raise Exception('404 page not found')
|
||||||
|
title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','')
|
||||||
dic = {
|
dic = {
|
||||||
'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
|
'title': title,
|
||||||
'studio': getStudio(htmlcode),
|
'studio': getStudio(htmlcode),
|
||||||
'year': getYear(htmlcode),
|
'year': getYear(htmlcode),
|
||||||
'outline': getOutline(number),
|
'outline': getOutline(number, title),
|
||||||
'runtime': getRuntime(htmlcode),
|
'runtime': getRuntime(htmlcode),
|
||||||
'director': getDirector(htmlcode),
|
'director': getDirector(htmlcode),
|
||||||
'actor': getActor(htmlcode),
|
'actor': getActor(htmlcode),
|
||||||
@@ -189,11 +166,12 @@ def main(number):
|
|||||||
htmlcode = get_html('https://www.javbus.com/' + number)
|
htmlcode = get_html('https://www.javbus.com/' + number)
|
||||||
if "<title>404 Page Not Found" in htmlcode:
|
if "<title>404 Page Not Found" in htmlcode:
|
||||||
raise Exception('404 page not found')
|
raise Exception('404 page not found')
|
||||||
|
title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode)))
|
||||||
dic = {
|
dic = {
|
||||||
'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
|
'title': title,
|
||||||
'studio': getStudio(htmlcode),
|
'studio': getStudio(htmlcode),
|
||||||
'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
|
'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
|
||||||
'outline': getOutline(number),
|
'outline': getOutline(number, title),
|
||||||
'runtime': getRuntime(htmlcode),
|
'runtime': getRuntime(htmlcode),
|
||||||
'director': getDirector(htmlcode),
|
'director': getDirector(htmlcode),
|
||||||
'actor': getActor(htmlcode),
|
'actor': getActor(htmlcode),
|
||||||
@@ -225,7 +203,11 @@ def main(number):
|
|||||||
return js
|
return js
|
||||||
|
|
||||||
if __name__ == "__main__" :
|
if __name__ == "__main__" :
|
||||||
#print(main('ADV-R0624')) # 404
|
config.G_conf_override['debug_mode:switch'] = True
|
||||||
|
print(main('ABP-888'))
|
||||||
|
print(main('ABP-960'))
|
||||||
|
# print(main('ADV-R0624')) # 404
|
||||||
|
# print(main('MMNT-010'))
|
||||||
print(main('ipx-292'))
|
print(main('ipx-292'))
|
||||||
print(main('CEMD-011'))
|
print(main('CEMD-011'))
|
||||||
print(main('CJOD-278'))
|
print(main('CJOD-278'))
|
||||||
|
|||||||
@@ -1,13 +1,11 @@
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
from mechanicalsoup.stateful_browser import StatefulBrowser
|
|
||||||
sys.path.append('../')
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
# import sys
|
from mechanicalsoup.stateful_browser import StatefulBrowser
|
||||||
|
from WebCrawler.storyline import getStoryline
|
||||||
# import io
|
# import io
|
||||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
||||||
|
|
||||||
@@ -206,9 +204,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return ''
|
return ''
|
||||||
def getOutline(number): #获取剧情介绍
|
def getOutline(number, title): #获取剧情介绍 多进程并发查询
|
||||||
from WebCrawler.javbus import getOutline as javbus_getOutline
|
return getStoryline(number,title)
|
||||||
return javbus_getOutline(number)
|
|
||||||
def getSeries(a):
|
def getSeries(a):
|
||||||
#/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
|
#/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
@@ -309,7 +306,7 @@ def main(number):
|
|||||||
'actor': getActor(detail_page),
|
'actor': getActor(detail_page),
|
||||||
'title': title,
|
'title': title,
|
||||||
'studio': getStudio(detail_page),
|
'studio': getStudio(detail_page),
|
||||||
'outline': getOutline(number),
|
'outline': getOutline(number, title),
|
||||||
'runtime': getRuntime(detail_page),
|
'runtime': getRuntime(detail_page),
|
||||||
'director': getDirector(detail_page),
|
'director': getDirector(detail_page),
|
||||||
'release': getRelease(detail_page),
|
'release': getRelease(detail_page),
|
||||||
@@ -350,11 +347,13 @@ if __name__ == "__main__":
|
|||||||
# print(main('blacked.20.05.30'))
|
# print(main('blacked.20.05.30'))
|
||||||
# print(main('AGAV-042'))
|
# print(main('AGAV-042'))
|
||||||
# print(main('BANK-022'))
|
# print(main('BANK-022'))
|
||||||
print(main('070116-197'))
|
# print(main('070116-197'))
|
||||||
print(main('093021_539')) # 没有剧照 片商pacopacomama
|
# print(main('093021_539')) # 没有剧照 片商pacopacomama
|
||||||
print(main('FC2-2278260'))
|
# print(main('FC2-2278260'))
|
||||||
print(main('FC2-735670'))
|
# print(main('FC2-735670'))
|
||||||
# print(main('FC2-1174949')) # not found
|
# print(main('FC2-1174949')) # not found
|
||||||
print(main('MVSD-439'))
|
print(main('MVSD-439'))
|
||||||
# print(main('EHM0001')) # not found
|
# print(main('EHM0001')) # not found
|
||||||
print(main('FC2-2314275'))
|
# print(main('FC2-2314275'))
|
||||||
|
# print(main('EBOD-646'))
|
||||||
|
print(main('LOVE-262'))
|
||||||
|
|||||||
270
WebCrawler/storyline.py
Normal file
270
WebCrawler/storyline.py
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from ADC_function import *
|
||||||
|
from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
|
||||||
|
from multiprocessing import Pool
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from unicodedata import category
|
||||||
|
|
||||||
|
G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"}
|
||||||
|
|
||||||
|
|
||||||
|
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
|
||||||
|
def getStoryline(number, title):
    """Query every configured storyline site in parallel and return the best hit.

    The site list comes from ``conf.storyline_site()`` (comma separated); only
    names present in ``G_registered_storyline_site`` are queried.  Results are
    prioritised by their position in that list: the first non-empty string wins.

    Args:
        number: Movie number handed to every site worker.
        title: Movie title handed to every site worker (used by the amazon one).

    Returns:
        The selected storyline text, or ``''`` when no site produced one or no
        site is configured.
    """
    start_time = time.time()
    conf = config.getInstance()
    show_level = conf.storyline_show()
    debug = conf.debug() or show_level == 2

    # Tolerate blanks around the commas ("airav, avno1") and drop unknown names.
    requested = (name.strip() for name in conf.storyline_site().split(','))
    apply_sites = [name for name in requested if name in G_registered_storyline_site]

    if apply_sites:
        mp_args = [(site, number, title, debug) for site in apply_sites]
        # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/
        # One worker per site so all queries really run at the same time,
        # independent of the CPU count.
        with Pool(processes=len(apply_sites)) as proc_pool:
            result = proc_pool.map(getStoryline_mp, mp_args)
    else:
        # An empty site list means "do not query": skip the pool start-up cost.
        result = []

    chosen_idx = next(
        (i for i, value in enumerate(result) if isinstance(value, str) and len(value)),
        -1,
    )
    chosen = result[chosen_idx] if chosen_idx >= 0 else ''

    if not debug and show_level == 0:
        return chosen

    # Original author's note: the summary printed below is written to the log,
    # whereas output printed inside the pool processes only shows up on
    # standard output.
    cnt = len(apply_sites)
    parts = [f'[!]MP Storyline 运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}']
    for i, (site, value) in enumerate(zip(apply_sites, result)):
        sl = len(value) if isinstance(value, str) else 0
        if sl and i == chosen_idx:
            parts.append(f',[选中结果{site}字数:{sl}]')
        elif sl:
            parts.append(f',{site}字数:{sl}')
        else:
            parts.append(f',{site}:空')
    print(''.join(parts))
    return chosen
|
||||||
|
|
||||||
|
|
||||||
|
def getStoryline_mp(args):
    """Adapter for ``Pool.map``: expand one argument tuple into the worker call."""
    worker_args = tuple(args)
    return _getStoryline_mp(*worker_args)
|
||||||
|
|
||||||
|
|
||||||
|
# 注:新进程的print()不会写入日志中,将来调试修复失效数据源需直接查看标准输出,issue信息需截图屏幕
|
||||||
|
def _getStoryline_mp(site, number, title, debug):
    """Run the storyline lookup for one site and optionally report timing.

    Returns whatever the site-specific function returns, or ``None`` when
    ``site`` is not a string or is not one of the handled site names.  With
    ``debug`` set, a line with the elapsed time and the result is printed.
    """
    began = time.time()
    if not isinstance(site, str):
        return None

    # Site name -> deferred call; only the selected entry is ever invoked.
    lookups = {
        "airav": lambda: getStoryline_airav(number, debug),
        "avno1": lambda: getStoryline_avno1(number, debug),
        "xcity": lambda: getStoryline_xcity(number, debug),
        "amazon": lambda: getStoryline_amazon(title, number, debug),
    }
    lookup = lookups.get(site)
    storyline = lookup() if lookup is not None else None

    if debug:
        print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
            site,
            time.time() - began,
            time.strftime("%H:%M:%S"),
            storyline if isinstance(storyline, str) and len(storyline) else '[空]')
        )
    return storyline
|
||||||
|
|
||||||
|
|
||||||
|
def getStoryline_airav(number, debug):
    """Fetch the storyline for ``number`` from one of the airav mirror sites.

    Searches the site, checks that the first search hit and the detail page
    both carry the requested number, and returns the stripped introduction
    text.  Any failure is swallowed (printed when ``debug`` is true) and
    ``None`` is returned.
    """
    try:
        # Compare case-insensitively: the page heading is uppercased below.
        number_up = number.upper()
        site = secrets.choice(('airav.cc','airav4.club'))
        url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
        res, browser = get_html_by_browser(url, return_type='browser')
        if not res.ok:
            raise ValueError(f"get_html_by_browser('{url}') failed")
        avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
        if avs is None:
            raise ValueError("search result not found")
        if number_up not in avs.select_one('a > h3').text.upper():
            raise ValueError("number not found")
        detail_url = avs.select_one('a')['href']
        res = browser.open_relative(detail_url)
        if not res.ok:
            raise ValueError(f"browser.open_relative('{detail_url}') failed")
        t = browser.page.select_one('head > title').text
        # The detail page title starts with "[NUMBER]".
        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
        if number_up != airav_number:
            raise ValueError(f"page number ->[{airav_number}] not match")
        return browser.page.select_one('li.introduction > span').text.strip()
    except Exception as e:
        if debug:
            print(f"[-]MP getOutline_airav Error: {e}, number [{number}].")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
    """Look ``number`` up through the avno1.cc search form.

    Returns the ``data-description`` text of the first result when its heading
    ends with the requested number, otherwise ``''``.  Failures are swallowed
    and only printed when ``debug`` is true.
    """
    try:
        # Pick a random landing page so the site's httpd log does not show one
        # IP requesting the very same URL over and over.
        help_page = 'usercenter.php?item=' + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn'])
        landing_pages = [
            help_page,
            '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
        ]
        url = 'http://www.avno1.cc/cn/' + secrets.choice(landing_pages)
        wanted = number.upper()
        result, browser = get_html_by_form(
            url,
            form_select='div.wrapper > div.header > div.search > form',
            fields={'kw': wanted},
            return_type='browser',
        )
        if not result.ok:
            raise ValueError(f"get_html_by_form('{url}','{wanted}') failed")
        heading = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
        # The number is taken from the last space-separated part of the heading.
        page_number = heading[heading.rfind(' '):].upper()
        if wanted not in page_number:
            raise ValueError(f"page number ->[{page_number}] not match")
        first_item = browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]
        return first_item['data-description'].strip()
    except Exception as e:
        if debug:
            print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
    return ''
|
||||||
|
|
||||||
|
|
||||||
|
def getStoryline_xcity(number, debug):  # fetch the storyline from xcity
    """Look ``number`` up through the xcity.jp search form.

    Follows the first ``avod/detail`` link of the search result and returns
    the stripped lead paragraph of that page, or ``''`` on any failure
    (printed when ``debug`` is true).
    """
    try:
        # The search term is the number without hyphens, lower-cased.  This
        # assignment was commented out, so the lookup always hit a NameError.
        xcity_number = number.replace('-','')
        query_result, browser = get_html_by_form(
            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
            fields = {'q' : xcity_number.lower()},
            return_type = 'browser')
        if not query_result or not query_result.ok:
            raise ValueError("page not found")
        result = browser.follow_link(browser.links(r'avod/detail')[0])
        if not result.ok:
            raise ValueError("detail page not found")
        return browser.page.select_one('h2.title-detail + p.lead').text.strip()
    except Exception as e:
        if debug:
            print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
    return ''
|
||||||
|
|
||||||
|
|
||||||
|
def getStoryline_amazon(q_title, number, debug):
    """Search amazon.co.jp for ``q_title`` and return the product description.

    The matching search hit is chosen by ``amazon_select_one``.  When an
    age-confirmation ("black-curtain") link had to be followed, the session
    cookies are saved to ``~/.local/share/avdc/amazon.json`` afterwards.

    Returns:
        The description text with newlines collapsed and the review number
        removed, or ``None`` when the title is empty/not a string or any step
        fails (the error is printed when ``debug`` is true).
    """
    if not isinstance(q_title, str) or not len(q_title):
        return None
    try:
        from urllib.parse import quote_plus

        amazon_cookie, _ = load_cookies('amazon.json')
        cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
        # Encode the title so characters such as '&', '#' or '+' cannot
        # truncate or alter the search query.
        url = "https://www.amazon.co.jp/s?k=" + quote_plus(q_title)
        res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
        if not res.ok:
            raise ValueError("get_html_by_browser() failed")
        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
        if isinstance(lks, list) and len(lks):
            browser.follow_link(lks[0])
            cookie = None  # marks the cookies for saving further down
        html = etree.fromstring(str(browser.page), etree.HTMLParser())
        titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
        urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
        if not len(urls) or len(urls) != len(titles):
            raise ValueError("titles not found")
        idx = amazon_select_one(titles, q_title, number, debug)
        if not isinstance(idx, int) or idx < 0:
            raise ValueError("title and number not found")
        furl = urls[idx]
        r = browser.open_relative(furl)
        if not r.ok:
            raise ValueError("browser.open_relative() failed.")
        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
        if isinstance(lks, list) and len(lks):
            browser.follow_link(lks[0])
            cookie = None

        ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
        ama_t = re.sub(r'審査番号:\d+', '', ama_t)

        if cookie is None:
            # Original author's note: the automatically created cookie file
            # sits at the end of the search path list (lowest priority); users
            # with an amazon.co.jp account can export cookies from a browser
            # and place them earlier in the search path.
            ama_save = Path.home() / ".local/share/avdc/amazon.json"
            ama_save.parent.mkdir(parents=True, exist_ok=True)
            ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')

        return ama_t

    except Exception as e:
        if debug:
            print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
    return None
|
||||||
|
|
||||||
|
# 查货架中DVD和蓝光商品中标题相似度高的
|
||||||
|
def amazon_select_one(a_titles, q_title, number, debug):
    """Pick the shelf title that best matches the queried movie title.

    A title containing ``number`` (case-insensitive, literal match) is
    accepted immediately.  Otherwise only titles mentioning DVD/Blu-ray are
    considered: punctuation, symbols and separators are stripped from both
    sides and the candidate is scored with ``SequenceMatcher``.

    Args:
        a_titles: Titles found on the search result page.
        q_title: Title that was searched for.
        number: Movie number.
        debug: When true, accepted and rejected matches are appended to log
            files under ``~/.avlogs``.

    Returns:
        Index into ``a_titles`` of the accepted title (similarity >= 0.9),
        or ``-1`` when nothing acceptable was found.
    """
    number_re = re.compile(re.escape(number), re.I)  # literal, not a pattern
    media_re = re.compile('DVD|Blu-ray', re.I)
    noise_re = re.compile(r'(P|S|Z).*', re.A)  # punctuation/symbol/separator categories

    sel = -1
    ratio = 0
    que_t = ''.join(c for c in q_title if not noise_re.match(category(c)))
    for loc, t in enumerate(a_titles):
        # Shelf titles rarely carry the number, but when one does it is
        # accepted without further comparison.
        if number_re.search(t):
            ratio = 1.0
            sel = loc
            save_t_ = t
            break
        if not media_re.search(t):
            continue
        # Case-insensitive removal of every media tag (the flag used to be
        # passed in the position of re.sub()'s count argument).
        ama_t = media_re.sub("", t)
        ama_t = ''.join(c for c in ama_t if not noise_re.match(category(c)))
        findlen = 0
        lastpos = -1
        cnt = len(ama_t)
        # Walk the candidate backwards, cutting off its tail until a run of
        # characters that appear consecutively in the query title is found.
        for c in reversed(ama_t):
            cnt -= 1
            pos = que_t.rfind(c)
            if lastpos >= 0:
                pos_near = que_t[:lastpos].rfind(c)
                if pos_near < 0:
                    findlen = 0
                    lastpos = -1
                    ama_t = ama_t[:cnt+1]
                else:
                    pos = pos_near
            if pos < 0:
                if category(c) == 'Nd':
                    return -1
                ama_t = ama_t[:cnt]
                findlen = 0
                lastpos = -1
                continue
            if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
                findlen += 1
                lastpos = pos
                if findlen >= 4:
                    break
                continue
            findlen = 1
            lastpos = pos
        if findlen==0:
            return -1
        r = SequenceMatcher(None, ama_t, que_t).ratio()
        if r > ratio:
            sel = loc
            ratio = r
            save_t_ = ama_t
            if ratio > 0.999:
                break

    if ratio < 0.5:
        return -1

    if not debug:
        # Only results with a similarity of at least 0.9 are trusted for now.
        return sel if ratio >= 0.9 else -1

    # Debug mode: record the match so the recognition accuracy can be reviewed.
    record = f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n'
    if ratio < 0.9:
        # Rejected results with similarity in [0.5, 0.9) go to their own log.
        with (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8') as log:
            log.write(record)
        return -1
    # Log of the accepted results.
    with (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8') as log:
        log.write(record)
    return sel
|
||||||
11
config.ini
11
config.ini
@@ -7,7 +7,7 @@ soft_link=0
|
|||||||
failed_move=1
|
failed_move=1
|
||||||
auto_exit=0
|
auto_exit=0
|
||||||
transalte_to_sc=0
|
transalte_to_sc=0
|
||||||
multi_threading=1
|
multi_threading=0
|
||||||
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
|
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
|
||||||
actor_gender=female
|
actor_gender=female
|
||||||
del_empty_folder=1
|
del_empty_folder=1
|
||||||
@@ -85,3 +85,12 @@ water=2
|
|||||||
switch=0
|
switch=0
|
||||||
extrafanart_folder=extrafanart
|
extrafanart_folder=extrafanart
|
||||||
|
|
||||||
|
; 剧情简介
|
||||||
|
[storyline]
|
||||||
|
; website为javbus或javdb时,site为获取剧情简介信息的可选数据源站点列表。列表内站点同时并发查询,取值优先级
|
||||||
|
; 从左到右,靠左站点没数据才会采用后面站点获得的。其中airav和avno1是中文剧情简介,xcity和amazon是日语的,由
|
||||||
|
; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。
|
||||||
|
; site=
|
||||||
|
site=airav,avno1,xcity,amazon
|
||||||
|
; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因
|
||||||
|
show_result=0
|
||||||
|
|||||||
19
config.py
19
config.py
@@ -240,6 +240,20 @@ class Config:
|
|||||||
def debug(self) -> bool:
|
def debug(self) -> bool:
|
||||||
return self.getboolean_override("debug_mode", "switch")
|
return self.getboolean_override("debug_mode", "switch")
|
||||||
|
|
||||||
|
def storyline_site(self) -> str:
    """Return the comma-separated storyline site list from ``[storyline] site``.

    Falls back to all four known sites when the value cannot be read.
    """
    try:
        return self.conf.get("storyline", "site")
    except Exception:
        # Missing section/option: use the default list.  Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit through.
        return "airav,avno1,xcity,amazon"
|
||||||
|
|
||||||
|
def storyline_show(self) -> int:
    """Return the ``[storyline] show_result`` level clamped to 0, 1 or 2.

    Values above 2 become 2, negative values become 0, and a missing or
    non-integer setting yields 0.
    """
    try:
        v = self.conf.getint("storyline", "show_result")
    except Exception:
        # Missing or malformed setting; KeyboardInterrupt/SystemExit still
        # propagate, which a bare ``except`` would have swallowed.
        return 0
    return min(max(v, 0), 2)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _exit(sec: str) -> None:
|
def _exit(sec: str) -> None:
|
||||||
print("[-] Read config error! Please check the {} section in config.ini", sec)
|
print("[-] Read config error! Please check the {} section in config.ini", sec)
|
||||||
@@ -333,6 +347,11 @@ class Config:
|
|||||||
conf.set(sec13, "switch", 1)
|
conf.set(sec13, "switch", 1)
|
||||||
conf.set(sec13, "extrafanart_folder", "extrafanart")
|
conf.set(sec13, "extrafanart_folder", "extrafanart")
|
||||||
|
|
||||||
|
sec14 = "storyline"
|
||||||
|
conf.add_section(sec14)
|
||||||
|
conf.set(sec14, "site", "airav,avno1,xcity,amazon")
|
||||||
|
conf.set(sec14, "show_result", 0)
|
||||||
|
|
||||||
return conf
|
return conf
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user