javdb.py: 优化,修理getActorPhoto()

This commit is contained in:
lededev
2021-10-18 19:52:42 +08:00
parent 5ef16e3a6d
commit 4428971135

View File

@@ -9,13 +9,11 @@ from WebCrawler.storyline import getStoryline
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a): def getTitle(html):
html = etree.fromstring(a, etree.HTMLParser())
browser_title = str(html.xpath("/html/head/title/text()")[0]) browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip() return browser_title[:browser_title.find(' | JavDB')].strip()
def getActor(a): def getActor(html):
html = etree.fromstring(a, etree.HTMLParser())
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = [] r = []
@@ -32,8 +30,8 @@ def getActor(a):
idx = idx + 1 idx = idx + 1
return r return r
def getaphoto(url): def getaphoto(url, browser):
html_page = get_html(url) html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
img_url = img_prether.findall(html_page) img_url = img_prether.findall(html_page)
if img_url: if img_url:
@@ -41,24 +39,18 @@ def getaphoto(url):
else: else:
return '' return ''
def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img
actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>') actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
actorall = actorall_prether.findall(html) if not actorall:
if actorall:
actoralls = actorall[0]
actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>')
actor = actor_prether.findall(actoralls)
actor_photo = {}
for i in actor:
actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0])
return actor_photo
else:
return {} return {}
a = getActor(html)
actor_photo = {}
for i in actorall:
if i.text in a:
actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
return actor_photo
def getStudio(a): def getStudio(a, html):
# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
@@ -70,25 +62,21 @@ def getStudio(a):
if len(result): if len(result):
return result return result
# 以卖家作为工作室 # 以卖家作为工作室
html = etree.fromstring(a, etree.HTMLParser())
try: try:
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
except: except:
result = '' result = ''
return result return result
def getRuntime(a): def getRuntime(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi') return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a): def getLabel(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a): def getNum(html):
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+') return str(result2 + result1).strip('+')
@@ -118,8 +106,7 @@ def getRelease(a):
else: else:
result = '' result = ''
return result return result
def getTag(a): def getTag(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
total = [] total = []
@@ -140,11 +127,10 @@ def getTag(a):
pass pass
return total return total
def getCover_small(a, index=0): def getCover_small(html, index=0):
# same issue mentioned below, # same issue mentioned below,
# javdb sometimes returns multiple results # javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with correct index number # DO NOT just get the first one, get the one with correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result: if not 'https' in result:
@@ -175,23 +161,20 @@ def getTrailer(htmlcode): # 获取预告片
video_url = '' video_url = ''
return video_url return video_url
def getExtrafanart(htmlcode): # 获取剧照 def getExtrafanart(html): # 获取剧照
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = [] result = []
try: try:
result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
except: except:
pass pass
return result return result
def getCover(htmlcode): def getCover(html):
html = etree.fromstring(htmlcode, etree.HTMLParser())
try: try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result return result
def getDirector(a): def getDirector(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
@@ -206,9 +189,7 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404函数暂时
return '' return ''
def getOutline(number, title): #获取剧情介绍 多进程并发查询 def getOutline(number, title): #获取剧情介绍 多进程并发查询
return getStoryline(number,title) return getStoryline(number,title)
def getSeries(a): def getSeries(html):
#/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
@@ -243,6 +224,7 @@ def main(number):
javdb_site = secrets.choice(javdb_sites) javdb_site = secrets.choice(javdb_sites)
if debug: if debug:
print(f'[!]javdb:select site {javdb_site}') print(f'[!]javdb:select site {javdb_site}')
browser = None
try: try:
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
@@ -277,52 +259,54 @@ def main(number):
except: except:
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
# etree.fromstring开销很大最好只用一次而它的xpath很快比bs4 find/select快可以多用
lx = etree.fromstring(detail_page, etree.HTMLParser())
# no cut image by default # no cut image by default
imagecut = 3 imagecut = 3
# If gray image exists ,then replace with normal cover # If gray image exists ,then replace with normal cover
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
cover_small = getCover_small(query_result) cover_small = getCover_small(html)
else: else:
try: try:
cover_small = getCover_small(query_result, index=ids.index(number)) cover_small = getCover_small(html, index=ids.index(number))
except: except:
# if input number is "STAR438" not "STAR-438", use first search result. # if input number is "STAR438" not "STAR-438", use first search result.
cover_small = getCover_small(query_result) cover_small = getCover_small(html)
if 'placeholder' in cover_small: if 'placeholder' in cover_small:
# replace with normal cover and cut it # replace with normal cover and cut it
imagecut = 1 imagecut = 1
cover_small = getCover(detail_page) cover_small = getCover(lx)
dp_number = getNum(detail_page) dp_number = getNum(lx)
if dp_number.upper() != number: if dp_number.upper() != number:
raise ValueError("number not found") raise ValueError("number not found")
title = getTitle(detail_page) title = getTitle(lx)
if title and dp_number: if title and dp_number:
number = dp_number number = dp_number
# remove duplicate title # remove duplicate title
title = title.replace(number, '').strip() title = title.replace(number, '').strip()
dic = { dic = {
'actor': getActor(detail_page), 'actor': getActor(lx),
'title': title, 'title': title,
'studio': getStudio(detail_page), 'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title), 'outline': getOutline(number, title),
'runtime': getRuntime(detail_page), 'runtime': getRuntime(lx),
'director': getDirector(detail_page), 'director': getDirector(lx),
'release': getRelease(detail_page), 'release': getRelease(detail_page),
'number': number, 'number': number,
'cover': getCover(detail_page), 'cover': getCover(lx),
'cover_small': cover_small, 'cover_small': cover_small,
'trailer': getTrailer(detail_page), 'trailer': getTrailer(detail_page),
'extrafanart': getExtrafanart(detail_page), 'extrafanart': getExtrafanart(lx),
'imagecut': imagecut, 'imagecut': imagecut,
'tag': getTag(detail_page), 'tag': getTag(lx),
'label': getLabel(detail_page), 'label': getLabel(lx),
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
# 'actor_photo': getActorPhoto(detail_page), # 'actor_photo': getActorPhoto(lx, javdb_site, browser),
'website': 'https://javdb.com' + correct_url, 'website': 'https://javdb.com' + correct_url,
'source': 'javdb.py', 'source': 'javdb.py',
'series': getSeries(detail_page), 'series': getSeries(lx),
} }
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
@@ -356,4 +340,5 @@ if __name__ == "__main__":
# print(main('EHM0001')) # not found # print(main('EHM0001')) # not found
# print(main('FC2-2314275')) # print(main('FC2-2314275'))
# print(main('EBOD-646')) # print(main('EBOD-646'))
print(main('LOVE-262')) # print(main('LOVE-262'))
print(main('ABP-890'))