carib.py: add outline/series/actor_photo
ADC_function.py (20 changed lines, Normal file → Executable file)
@@ -84,6 +84,25 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)
 
 
+def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    if not result.ok:
+        return ''
+    result.encoding = "utf-8"
+    if return_type == "object":
+        return result
+    elif return_type == "content":
+        return result.content
+    elif return_type == "browser":
+        return result, browser
+    else:
+        return result.text
+
+
 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
     browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
     if isinstance(cookies, dict):
@@ -592,4 +611,3 @@ def is_link(filename: str):
     elif os.stat(filename).st_nlink > 1:
         return True # hard link Linux MAC OSX Windows NTFS
     return False
-
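Note: a minimal usage sketch of the new get_html_by_browser helper (hypothetical call; it mirrors how carib.py invokes it below). On a failed request the helper returns '' rather than a (response, browser) tuple, so the return value is checked before unpacking.

from ADC_function import get_html_by_browser

ret = get_html_by_browser(
    'https://www.caribbeancom.com/moviepages/041721-001/index.html',
    return_type="browser")
if ret:
    result, browser = ret  # result is the HTTP response, browser keeps the MechanicalSoup session
    print(browser.page.title)  # browser.page is the parsed BeautifulSoup document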
carib.py
@@ -8,15 +8,16 @@ from ADC_function import *
 
 def main(number: str) -> json:
     try:
-        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-                              return_type="content")
+        caribbytes, browser = get_html_by_browser(
+            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+            return_type="browser")
 
-        caribhtml = caribbytes.decode("euc_jp")
+        if not caribbytes or not caribbytes.ok:
+            raise ValueError("page not found")
 
-        soup = BeautifulSoup(caribhtml, "html.parser")
-        lx = html.fromstring(str(soup))
+        lx = html.fromstring(str(browser.page))
 
-        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
             raise ValueError("page info not found")
     except Exception as e:
         if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
         'title': get_title(lx),
         'studio': '加勒比',
         'year': get_year(lx),
-        'outline': '',
+        'outline': get_outline(lx),
         'runtime': get_runtime(lx),
         'director': '',
         'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
         'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
         'tag': get_tag(lx),
         'extrafanart': get_extrafanart(lx),
-        'label': '',
-        'imagecut': 0,
-        'actor_photo': '',
+        'label': get_series(lx),
+        'imagecut': 1,
+        'actor_photo': get_actor_photo(browser),
         'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
         'source': 'carib.py',
-        'series': '',
+        'series': get_series(lx),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
 
+def get_outline(lx: html.HtmlElement) -> str:
+    return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+
 def get_release(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
 
-def get_actor(lx: html.HtmlElement) -> str:
+def get_actor(lx: html.HtmlElement):
     r = []
     actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
     for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
         r.append('https://www.caribbeancom.com' + jpg)
     return r
 
+def get_series(lx: html.HtmlElement) -> str:
+    try:
+        return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
+    except:
+        return ''
+
 def get_runtime(lx: html.HtmlElement) -> str:
     return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
 
+def get_actor_photo(browser):
+    htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+    t = {}
+    for a in htmla:
+        if a.text.strip() == '他':
+            continue
+        p = {a.text.strip(): a['href']}
+        t.update(p)
+    o = {}
+    for k, v in t.items():
+        if '/search_act/' not in v:
+            continue
+        r = browser.open_relative(v)
+        if not r.ok:
+            continue
+        html = browser.page.prettify()
+        pos = html.find('.full-bg')
+        if pos<0:
+            continue
+        css = html[pos:pos+100]
+        p0 = css.find('background: url(')
+        p1 = css.find('.jpg)')
+        if p0<0 or p1<0:
+            continue
+        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        o.update(p)
+    return o
+
 if __name__ == "__main__":
+    print(main("070116-197")) # actor have photo
     print(main("041721-001"))
     print(main("080520-001"))
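Note: get_actor_photo follows each actress link under the movie's spec list and then slices the actress page's inline CSS around the '.full-bg' selector to recover the portrait URL from its background: url(...) rule. An illustrative sketch of that slicing step follows; the sample CSS and page URL are made up to show the expected shape, not copied from the site.

from urllib.parse import urljoin

sample = '.full-bg { background: url(/actress/images/example.jpg) no-repeat; }'  # hypothetical CSS snippet
pos = sample.find('.full-bg')
css = sample[pos:pos + 100]
p0 = css.find('background: url(')
p1 = css.find('.jpg)')
if p0 >= 0 and p1 >= 0:
    photo = urljoin('https://www.caribbeancom.com/search_act/example/1.html',  # hypothetical actress page URL
                    css[p0 + len('background: url('):p1 + len('.jpg')])
    print(photo)  # -> https://www.caribbeancom.com/actress/images/example.jpg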