deal with websites behind Clo*dfl**e
This commit is contained in:
@@ -30,7 +30,7 @@ def getActor(html):
|
||||
return r
|
||||
|
||||
def getaphoto(url, session):
|
||||
html_page = session.get(url).text if isinstance(session, requests.Session) else get_html(url)
|
||||
html_page = session.get(url).text if session is not None else get_html(url)
|
||||
img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
|
||||
img_url = img_prether.findall(html_page)
|
||||
if img_url:
|
||||
@@ -215,14 +215,21 @@ def main(number):
|
||||
if debug:
|
||||
print(f'[!]javdb:select site {javdb_site}')
|
||||
session = None
|
||||
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
|
||||
try:
|
||||
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
|
||||
if debug:
|
||||
raise # try get_html_by_scraper() branch
|
||||
res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
|
||||
if not res:
|
||||
raise
|
||||
query_result = res.text
|
||||
except:
|
||||
query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies)
|
||||
res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
|
||||
if not res:
|
||||
raise ValueError('page not found')
|
||||
query_result = res.text
|
||||
if session is None:
|
||||
raise ValueError('page not found')
|
||||
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
# javdb sometimes returns multiple results,
|
||||
# and the first element may not be the one we are looking for
|
||||
@@ -241,14 +248,12 @@ def main(number):
|
||||
raise ValueError("number not found")
|
||||
correct_url = urls[0]
|
||||
try:
|
||||
if isinstance(session, requests.Session): # get faster benefit from http keep-alive
|
||||
# get faster benefit from http keep-alive
|
||||
javdb_detail_url = urljoin(res.url, correct_url)
|
||||
detail_page = session.get(javdb_detail_url).text
|
||||
else:
|
||||
javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
|
||||
detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
|
||||
except:
|
||||
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
|
||||
session = None
|
||||
|
||||
# etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用
|
||||
lx = etree.fromstring(detail_page, etree.HTMLParser())
|
||||
@@ -309,7 +314,7 @@ def main(number):
|
||||
|
||||
|
||||
except Exception as e:
|
||||
if config.getInstance().debug():
|
||||
if debug:
|
||||
print(e)
|
||||
dic = {"title": ""}
|
||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
||||
@@ -324,12 +329,12 @@ if __name__ == "__main__":
|
||||
# print(main('BANK-022'))
|
||||
# print(main('070116-197'))
|
||||
# print(main('093021_539')) # 没有剧照 片商pacopacomama
|
||||
# print(main('FC2-2278260'))
|
||||
print(main('FC2-2278260'))
|
||||
# print(main('FC2-735670'))
|
||||
# print(main('FC2-1174949')) # not found
|
||||
print(main('MVSD-439'))
|
||||
# print(main('EHM0001')) # not found
|
||||
# print(main('FC2-2314275'))
|
||||
print(main('FC2-2314275'))
|
||||
# print(main('EBOD-646'))
|
||||
# print(main('LOVE-262'))
|
||||
print(main('ABP-890'))
|
||||
|
||||
@@ -59,7 +59,7 @@ def getStoryline(number, title, sites: list=None):
|
||||
return value
|
||||
return ''
|
||||
# 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示
|
||||
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
|
||||
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
|
||||
first = True
|
||||
sel = ''
|
||||
for i, site in enumerate(apply_sites):
|
||||
@@ -175,7 +175,7 @@ def getStoryline_airavwiki(number, debug):
|
||||
|
||||
except Exception as e:
|
||||
if debug:
|
||||
print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
|
||||
print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
|
||||
pass
|
||||
return ''
|
||||
|
||||
@@ -190,7 +190,7 @@ def getStoryline_58avgo(number, debug):
|
||||
result, browser = get_html_by_form(url,
|
||||
fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
|
||||
return_type = 'browser')
|
||||
if not result.ok:
|
||||
if not result:
|
||||
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
|
||||
if f'searchresults.aspx?Search={kwd}' not in browser.url:
|
||||
raise ValueError("number not found")
|
||||
@@ -219,6 +219,29 @@ def getStoryline_58avgo(number, debug):
|
||||
|
||||
|
||||
def getStoryline_avno1(number, debug):  # fetch the storyline text from an avno1.cc-style site
    """Return the storyline text for ``number`` from a randomly chosen site.

    One host is picked at random from a fixed list, its search page is
    queried with ``number``, and the search results are scanned in page
    order. The description of the first result whose title ends with a
    token containing ``number`` (case-insensitive) is returned.

    Args:
        number: Movie number to search for; it is placed in the query
            string and matched literally against each result title.
        debug: When truthy, any failure is printed to standard output.

    Returns:
        The stripped description string of the matching result, or ``''``
        when nothing matches or any error occurs (this function is
        best-effort and never raises).
    """
    try:
        site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
                               'hotav.biz','iqq2.xyz','javhq.tv',
                               'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
        url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
        # NOTE(review): presumably get_html_by_scraper() returns the page HTML
        # as text when called with only a URL -- confirm against its definition.
        lx = fromstring(get_html_by_scraper(url))
        descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
        titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
        if not descs:
            raise ValueError("number not found")
        # Match the number literally: characters such as '.' or '+' in a movie
        # number must not be interpreted as regex syntax.
        number_re = re.compile(re.escape(number), re.I)
        # Pre-bind so the final error message is well-formed even when the
        # page yields descriptions but no titles.
        page_number = ''
        # zip() pairs each title with its description and stops at the shorter
        # list, so a length mismatch cannot cause an IndexError.
        # NOTE(review): assumes both XPath results are in the same order.
        for title, desc in zip(titles, descs):
            # The number is expected to be the last space-separated token of
            # the title; a title without spaces is used as a whole.
            page_number = title.rsplit(' ', 1)[-1].strip()
            if number_re.search(page_number):
                return desc.strip()
        raise ValueError(f"page number ->[{page_number}] not match")
    except Exception as e:
        # Deliberate best-effort: a storyline is optional, so every failure
        # degrades to an empty result instead of propagating.
        if debug:
            print(f"[-]MP getStoryline_avno1 Error: {e}, number [{number}].")
    return ''
|
||||
|
||||
|
||||
def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得
|
||||
try:
|
||||
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
|
||||
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
|
||||
@@ -343,6 +366,8 @@ def amazon_select_one(a_titles, q_title, number, debug):
|
||||
if pos < 0:
|
||||
if category(char) == 'Nd':
|
||||
return -1
|
||||
if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
|
||||
return -1
|
||||
ama_t = ama_t[:cloc]
|
||||
findlen = 0
|
||||
lastpos = -1
|
||||
|
||||
Reference in New Issue
Block a user