WebCrawler:全面换装getInstance(),厘清airav.py与javbus.py及javdb.py的相爱相杀
This commit is contained in:
@@ -1,51 +1,53 @@
|
||||
import sys
|
||||
sys.path.append('../')
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from lxml import html
|
||||
import re
|
||||
from ADC_function import *
|
||||
|
||||
def main(number: str) -> str:
    """Scrape the Caribbeancom movie page for ``number`` and return metadata as JSON.

    Args:
        number: Movie identifier as it appears in the site's
            ``/moviepages/<number>/`` URL path.

    Returns:
        A JSON string. When the page was fetched and recognised it holds the
        full metadata record; when fetching or recognising the page fails it
        holds only ``{"title": ""}``.

    Note:
        Only the fetch/validation phase is guarded by ``try``. Exceptions
        raised by the ``get_*`` field extractors propagate to the caller.
    """
    page_url = 'https://www.caribbeancom.com/moviepages/' + number + '/index.html'

    try:
        # With return_type="browser" the helper is unpacked as a pair: a
        # response-like object exposing ``.ok`` and a browser exposing
        # ``.page`` (presumably a BeautifulSoup document -- confirm in
        # ADC_function).
        carib_obj, browser = get_html_by_browser(page_url, return_type="browser")

        if not carib_obj or not carib_obj.ok:
            raise ValueError("page not found")

        lx = html.fromstring(str(browser.page))

        # The movie-info section is treated as the marker of a real movie page.
        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
    except Exception as e:
        # Best-effort scraper: report the cause only in debug mode and hand
        # back an empty-title record instead of raising.
        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    dic = {
        'title': get_title(lx),
        'studio': '加勒比',
        'year': get_year(lx),
        'outline': get_outline(lx),
        'runtime': get_runtime(lx),
        'director': '',
        'actor': get_actor(lx),
        'release': get_release(lx),
        'number': number,
        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
        'tag': get_tag(lx),
        'extrafanart': get_extrafanart(lx),
        'label': get_series(lx),
        'imagecut': 1,
        # 'actor_photo': get_actor_photo(browser),
        'website': page_url,
        'source': 'carib.py',
        'series': get_series(lx),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
    return js
|
||||
|
||||
|
||||
def get_title(lx: html.HtmlElement) -> str:
    """Return the movie title taken from the page's heading, whitespace-stripped.

    Args:
        lx: Parsed document to query with XPath.

    Returns:
        The text of the first matching ``h1[@itemprop='name']`` node inside
        the ``movie-info section`` heading, with surrounding whitespace removed.

    Raises:
        IndexError: If the XPath query matches nothing.
    """
    title_nodes = lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")
    first_match = title_nodes[0]
    return str(first_match).strip()
|
||||
@@ -114,11 +116,10 @@ def get_actor_photo(browser):
|
||||
if pos<0:
|
||||
continue
|
||||
css = html[pos:pos+100]
|
||||
p0 = css.find('background: url(')
|
||||
p1 = css.find('.jpg)')
|
||||
if p0<0 or p1<0:
|
||||
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
|
||||
if not cssBGjpgs or not len(cssBGjpgs[0]):
|
||||
continue
|
||||
p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
|
||||
p = {k: urljoin(browser.url, cssBGjpgs[0])}
|
||||
o.update(p)
|
||||
return o
|
||||
|
||||
|
||||
Reference in New Issue
Block a user