javdb.py: optimize and fix getActorPhoto()
@@ -9,13 +9,11 @@ from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
-def getTitle(a):
-    html = etree.fromstring(a, etree.HTMLParser())
+def getTitle(html):
     browser_title = str(html.xpath("/html/head/title/text()")[0])
     return browser_title[:browser_title.find(' | JavDB')].strip()
 
-def getActor(a):
-    html = etree.fromstring(a, etree.HTMLParser())
+def getActor(html):
     actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
     genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
     r = []
@@ -32,8 +30,8 @@ def getActor(a):
         idx = idx + 1
     return r
 
-def getaphoto(url):
-    html_page = get_html(url)
+def getaphoto(url, browser):
+    html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
     img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
     img_url = img_prether.findall(html_page)
     if img_url:
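
Editor's note: the point of the new browser parameter is session reuse: when the caller already holds the browser object used for the search, getaphoto fetches the actor page over that same session instead of opening a fresh connection through get_html. A minimal sketch of that reuse, assuming the StatefulBrowser in question is mechanicalsoup's and using an invented actor path:

    import mechanicalsoup

    browser = mechanicalsoup.StatefulBrowser()
    # first request establishes cookies and connection state
    browser.open('https://javdb.com/search?q=ABP-890&f=all')
    # '/actors/xxxx' is illustrative; open_relative() resolves it against the current page
    avatar_page = browser.open_relative('/actors/xxxx').text
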
@@ -41,24 +39,18 @@ def getaphoto(url):
     else:
         return ''
 
-def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img
-    actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>')
-    actorall = actorall_prether.findall(html)
-
-    if actorall:
-        actoralls = actorall[0]
-        actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>')
-        actor = actor_prether.findall(actoralls)
-        actor_photo = {}
-        for i in actor:
-            actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0])
-
-        return actor_photo
-
-    else:
-        return {}
+def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img
+    actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
+    if not actorall:
+        return {}
+    a = getActor(html)
+    actor_photo = {}
+    for i in actorall:
+        if i.text in a:
+            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
+    return actor_photo
 
-def getStudio(a):
+def getStudio(a, html):
     # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
     # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
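
Editor's note: a standalone sketch (not part of the patch, sample HTML invented) of the element-based extraction the rewritten getActorPhoto() relies on. Each matched <a> element carries the actor name in .text and the relative profile link in .attrib['href'], which urljoin turns into a full URL:

    from urllib.parse import urljoin
    from lxml import etree

    sample = ('<div><strong>演員:</strong><span class="value">'
              '<a href="/actors/2kk7">Alice</a><strong class="symbol female"></strong>'
              '</span></div>')
    tree = etree.fromstring(sample, etree.HTMLParser())
    for a in tree.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'):
        # name from the element text, profile URL from its href attribute
        print(a.text, urljoin('https://javdb.com', a.attrib['href']))
    # -> Alice https://javdb.com/actors/2kk7
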
@@ -70,25 +62,21 @@ def getStudio(a):
     if len(result):
         return result
     # use the seller as the studio
-    html = etree.fromstring(a, etree.HTMLParser())
     try:
         result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
     except:
         result = ''
     return result
 
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRuntime(html):
     result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getLabel(html):
     result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
-def getNum(a):
-    html = etree.fromstring(a, etree.HTMLParser())
+def getNum(html):
     result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
     return str(result2 + result1).strip('+')
@@ -118,8 +106,7 @@ def getRelease(a):
     else:
         result = ''
     return result
-def getTag(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getTag(html):
     try:
         result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
         total = []
@@ -140,11 +127,10 @@ def getTag(a):
         pass
     return total
 
-def getCover_small(a, index=0):
+def getCover_small(html, index=0):
     # same issue mentioned below,
     # javdb sometimes returns multiple results,
     # DO NOT just get the first one, get the one with the correct index number
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     try:
         result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
         if not 'https' in result:
@@ -175,23 +161,20 @@ def getTrailer(htmlcode): # get the trailer
         video_url = ''
     return video_url
 
-def getExtrafanart(htmlcode): # get the sample stills
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getExtrafanart(html): # get the sample stills
     result = []
     try:
         result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
     except:
         pass
     return result
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover(html):
     try:
         result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
     except: # 2020.7.17 Repair Cover Url crawl
         result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
     return result
-def getDirector(a):
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getDirector(html):
     result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
@@ -206,9 +189,7 @@ def getOutline0(number): # get the plot synopsis; airav.wiki returns 404, function temporarily
     return ''
 def getOutline(number, title): # get the plot synopsis via concurrent multi-process lookup
     return getStoryline(number,title)
-def getSeries(a):
-    #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
-    html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getSeries(html):
     result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
@@ -243,6 +224,7 @@ def main(number):
     javdb_site = secrets.choice(javdb_sites)
     if debug:
         print(f'[!]javdb:select site {javdb_site}')
+    browser = None
     try:
         javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
         res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
@@ -277,52 +259,54 @@ def main(number):
         except:
             detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
 
+        # etree.fromstring is expensive, so call it only once; its xpath() is fast (faster than bs4 find/select) and can be used freely
+        lx = etree.fromstring(detail_page, etree.HTMLParser())
         # no cut image by default
         imagecut = 3
         # If gray image exists, then replace with normal cover
         if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
-            cover_small = getCover_small(query_result)
+            cover_small = getCover_small(html)
         else:
             try:
-                cover_small = getCover_small(query_result, index=ids.index(number))
+                cover_small = getCover_small(html, index=ids.index(number))
             except:
                 # if input number is "STAR438" not "STAR-438", use first search result.
-                cover_small = getCover_small(query_result)
+                cover_small = getCover_small(html)
         if 'placeholder' in cover_small:
             # replace with normal cover and cut it
             imagecut = 1
-            cover_small = getCover(detail_page)
+            cover_small = getCover(lx)
 
-        dp_number = getNum(detail_page)
+        dp_number = getNum(lx)
         if dp_number.upper() != number:
             raise ValueError("number not found")
-        title = getTitle(detail_page)
+        title = getTitle(lx)
         if title and dp_number:
             number = dp_number
             # remove duplicate title
             title = title.replace(number, '').strip()
 
         dic = {
-            'actor': getActor(detail_page),
+            'actor': getActor(lx),
             'title': title,
-            'studio': getStudio(detail_page),
+            'studio': getStudio(detail_page, lx),
             'outline': getOutline(number, title),
-            'runtime': getRuntime(detail_page),
-            'director': getDirector(detail_page),
+            'runtime': getRuntime(lx),
+            'director': getDirector(lx),
             'release': getRelease(detail_page),
             'number': number,
-            'cover': getCover(detail_page),
+            'cover': getCover(lx),
             'cover_small': cover_small,
             'trailer': getTrailer(detail_page),
-            'extrafanart': getExtrafanart(detail_page),
+            'extrafanart': getExtrafanart(lx),
             'imagecut': imagecut,
-            'tag': getTag(detail_page),
-            'label': getLabel(detail_page),
+            'tag': getTag(lx),
+            'label': getLabel(lx),
             'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
-            # 'actor_photo': getActorPhoto(detail_page),
+            # 'actor_photo': getActorPhoto(lx, javdb_site, browser),
             'website': 'https://javdb.com' + correct_url,
             'source': 'javdb.py',
-            'series': getSeries(detail_page),
+            'series': getSeries(lx),
 
         }
         if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
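
Editor's note: the comment added at the top of this hunk is the core of the optimization: parse the detail page into one lxml tree (lx) and hand that tree to every getter, rather than re-running etree.fromstring inside each of them. A minimal sketch of the pattern, with illustrative field names rather than the module's full dict:

    from lxml import etree

    def parse_detail(detail_page: str) -> dict:
        # one expensive parse ...
        lx = etree.fromstring(detail_page, etree.HTMLParser())
        # ... then many cheap xpath() queries against the same tree
        return {
            'title': str(lx.xpath('/html/head/title/text()')[0]),
            'cover': lx.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src"),
        }
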
@@ -356,4 +340,5 @@ if __name__ == "__main__":
     # print(main('EHM0001')) # not found
     # print(main('FC2-2314275'))
     # print(main('EBOD-646'))
-    print(main('LOVE-262'))
+    # print(main('LOVE-262'))
+    print(main('ABP-890'))