diff --git a/ADC_function.py b/ADC_function.py
old mode 100644
new mode 100755
index de56eb0..7a23a52
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -84,6 +84,25 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)
 
 
+def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    if not result.ok:
+        return ''
+    result.encoding = "utf-8"
+    if return_type == "object":
+        return result
+    elif return_type == "content":
+        return result.content
+    elif return_type == "browser":
+        return result, browser
+    else:
+        return result.text
+
+
 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
     browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
     if isinstance(cookies, dict):
@@ -592,4 +611,3 @@ def is_link(filename: str):
     elif os.stat(filename).st_nlink > 1:
         return True  # hard link Linux MAC OSX Windows NTFS
     return False
-
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index b173255..f4fa9c0 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -8,15 +8,16 @@ from ADC_function import *
 
 def main(number: str) -> json:
     try:
-        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-                              return_type="content")
+        caribbytes, browser = get_html_by_browser(
+            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+            return_type="browser")
 
-        caribhtml = caribbytes.decode("euc_jp")
+        if not caribbytes or not caribbytes.ok:
+            raise ValueError("page not found")
 
-        soup = BeautifulSoup(caribhtml, "html.parser")
-        lx = html.fromstring(str(soup))
+        lx = html.fromstring(str(browser.page))
 
-        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
             raise ValueError("page info not found")
     except Exception as e:
         if config.Config().debug():
@@ -27,7 +28,7 @@
         'title': get_title(lx),
         'studio': '加勒比',
         'year': get_year(lx),
-        'outline': '',
+        'outline': get_outline(lx),
         'runtime': get_runtime(lx),
         'director': '',
         'actor': get_actor(lx),
@@ -36,12 +37,12 @@
         'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
         'tag': get_tag(lx),
         'extrafanart': get_extrafanart(lx),
-        'label': '',
-        'imagecut': 0,
-        'actor_photo': '',
+        'label': get_series(lx),
+        'imagecut': 1,
+        'actor_photo': get_actor_photo(browser),
         'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
         'source': 'carib.py',
-        'series': '',
+        'series': get_series(lx),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
 
+def get_outline(lx: html.HtmlElement) -> str:
+    return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+
 def get_release(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
 
-def get_actor(lx: html.HtmlElement) -> str:
+def get_actor(lx: html.HtmlElement):
     r = []
     actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
     for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
         r.append('https://www.caribbeancom.com' + jpg)
     return r
 
+def get_series(lx: html.HtmlElement) -> str:
+    try:
+        return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
+    except IndexError:
+        return ''
+
 def get_runtime(lx: html.HtmlElement) -> str:
     return str(lx.xpath(
         "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
+
+def get_actor_photo(browser):
+    htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+    t = {}
+    for a in htmla:
+        if a.text.strip() == '他':
+            continue
+        p = {a.text.strip(): a['href']}
+        t.update(p)
+    o = {}
+    for k, v in t.items():
+        if '/search_act/' not in v:
+            continue
+        r = browser.open_relative(v)
+        if not r.ok:
+            continue
+        page_html = browser.page.prettify()
+        pos = page_html.find('.full-bg')
+        if pos < 0:
+            continue
+        css = page_html[pos:pos+100]
+        p0 = css.find('background: url(')
+        p1 = css.find('.jpg)')
+        if p0 < 0 or p1 < 0:
+            continue
+        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        o.update(p)
+    return o
+
 if __name__ == "__main__":
+    print(main("070116-197"))  # actor has photo
     print(main("041721-001"))
     print(main("080520-001"))