carib.py: add outline/series/actor_photo

This commit is contained in:
lededev
2021-09-23 15:45:00 +08:00
parent 5e0e8b9cea
commit 50574a705b
2 changed files with 70 additions and 13 deletions

View File

@@ -8,15 +8,16 @@ from ADC_function import *
def main(number: str) -> json:
try:
caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="content")
caribbytes, browser = get_html_by_browser(
'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="browser")
caribhtml = caribbytes.decode("euc_jp")
if not caribbytes or not caribbytes.ok:
raise ValueError("page not found")
soup = BeautifulSoup(caribhtml, "html.parser")
lx = html.fromstring(str(soup))
lx = html.fromstring(str(browser.page))
if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
raise ValueError("page info not found")
except Exception as e:
if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
'title': get_title(lx),
'studio': '加勒比',
'year': get_year(lx),
'outline': '',
'outline': get_outline(lx),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
'tag': get_tag(lx),
'extrafanart': get_extrafanart(lx),
'label': '',
'imagecut': 0,
'actor_photo': '',
'label': get_series(lx),
'imagecut': 1,
'actor_photo': get_actor_photo(browser),
'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
'source': 'carib.py',
'series': '',
'series': get_series(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
def get_year(lx: html.HtmlElement) -> str:
    """Return the first four characters of the movie's second spec entry.

    The value is read from the first text node matched by
    ``//li[2]/span[@class='spec-content']`` -- the same node ``get_release``
    reads, so it is presumably a ``YYYY/MM/DD`` date (confirm against the page).

    Raises:
        IndexError: if the XPath matches no text node.
    """
    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    first_text = spec_texts[0]
    return first_text[:4]
def get_outline(lx: html.HtmlElement) -> str:
    """Return the movie description with surrounding whitespace removed.

    Only the first text node of the ``<p itemprop='description'>`` element
    inside ``div.movie-info.section`` is used.

    Raises:
        IndexError: if the page has no such text node.
    """
    description_nodes = lx.xpath(
        "//div[@class='movie-info section']/p[@itemprop='description']/text()"
    )
    first_node = description_nodes[0]
    return first_node.strip()
def get_release(lx: html.HtmlElement) -> str:
    """Return the second spec entry's text with every '/' replaced by '-'.

    Reads the first text node matched by
    ``//li[2]/span[@class='spec-content']``.

    Raises:
        IndexError: if the XPath matches no text node.
    """
    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    slash_separated = spec_texts[0]
    return slash_separated.replace('/', '-')
def get_actor(lx: html.HtmlElement) -> str:
def get_actor(lx: html.HtmlElement):
r = []
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
r.append('https://www.caribbeancom.com' + jpg)
return r
def get_series(lx: html.HtmlElement) -> str:
    """Return the movie's series name, or '' when it cannot be determined.

    Finds the ``span.spec-title`` whose text contains 'シリーズ' (Japanese for
    "series") and returns the stripped text of the first link inside the
    sibling ``span.spec-content``.

    The lookup is best effort: any ordinary failure (typically no matching
    node) yields an empty string instead of raising.
    """
    try:
        series_names = lx.xpath(
            "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
        )
        return str(series_names[0]).strip()
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so the scraper can be interrupted.
        return ''
def get_runtime(lx: html.HtmlElement) -> str:
    """Return the stripped text of the ``itemprop='duration'`` span.

    Uses the first text node of that span inside a ``span.spec-content``.

    Raises:
        IndexError: if the page has no duration text node.
    """
    duration_nodes = lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")
    raw_duration = str(duration_nodes[0])
    return raw_duration.strip()
def get_actor_photo(browser):
    """Map actor names on the current movie page to their portrait URLs.

    Args:
        browser: a browser object exposing ``page`` (with ``select`` and
            ``prettify``), ``url`` and ``open_relative``. It must currently be
            on the movie page. NOTE(review): this looks like a MechanicalSoup
            ``StatefulBrowser`` -- confirm against ``get_html_by_browser``.

    Returns:
        dict: ``{actor name: absolute .jpg URL}``. Actors whose link is not a
        ``/search_act/`` page, whose page fails to load, or whose page has no
        ``.full-bg`` rule with a ``background: url(....jpg)`` are omitted.

    Side effects:
        Calls ``browser.open_relative`` once per actor profile link.
    """
    actor_links = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')

    # Collect name -> href up front: browser.page is read again after each
    # open_relative() call below, when it is expected to be the profile page.
    profile_hrefs = {}
    for link in actor_links:
        name = link.text.strip()
        if not name:
            continue
        # Anchors without an href get '' and are dropped by the filter below
        # instead of raising KeyError.
        profile_hrefs[name] = link.get('href') or ''

    url_prefix = 'background: url('
    photos = {}
    for name, href in profile_hrefs.items():
        if '/search_act/' not in href:
            continue
        response = browser.open_relative(href)
        if not response.ok:
            continue
        markup = browser.page.prettify()
        rule_pos = markup.find('.full-bg')
        if rule_pos < 0:
            continue
        # Only the 100 characters from the start of the rule are inspected.
        rule = markup[rule_pos:rule_pos + 100]
        start = rule.find(url_prefix)
        if start < 0:
            continue
        start += len(url_prefix)
        # Search for the terminator after the prefix, so a '.jpg)' occurring
        # earlier in the window cannot produce an empty or garbage slice.
        end = rule.find('.jpg)', start)
        if end < 0:
            continue
        photos[name] = urljoin(browser.url, rule[start:end + len('.jpg')])
    return photos
if __name__ == "__main__":
    # Manual smoke test: scrape a few known movie numbers and print the JSON.
    for sample_number in (
        "070116-197",  # actor has a photo
        "041721-001",
        "080520-001",
    ):
        print(main(sample_number))