AV_Data_Capture/WebCrawler/carib.py

import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
import re
from ADC_function import *

def main(number: str) -> json:
    try:
        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
                             return_type="content")

        caribhtml = caribbytes.decode("euc_jp")

        soup = BeautifulSoup(caribhtml, "html.parser")
        lx = html.fromstring(str(soup))

        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
    except:
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    dic = {
        'title': get_title(lx),
        'studio': '加勒比',
        'year': get_year(lx),
        'outline': '',
        'runtime': get_runtime(lx),
        'director': '',
        'actor': get_actor(lx),
        'release': get_release(lx),
        'number': number,
        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
        'tag': get_tag(lx),
        'extrafanart': get_extrafanart(lx),
        'label': '',
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
        'source': 'carib.py',
        'series': '',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
    return js

def get_title(lx: html.HtmlElement) -> str:
    return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()

def get_year(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]

def get_release(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')

def get_actor(lx: html.HtmlElement) -> str:
    r = []
    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
    for act in actors:
        if str(act) != '他':
            r.append(act)
    return r

def get_tag(lx: html.HtmlElement) -> str:
    r = []
    genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
    for g in genres:
        r.append(translateTag_to_sc(str(g)))
    return r

def get_extrafanart(lx: html.HtmlElement) -> str:
    r = []
    genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
    for g in genres:
        jpg = str(g)
        if '/member/' in jpg:
            break
        else:
            r.append('https://www.caribbeancom.com' + jpg)
    return r

def get_runtime(lx: html.HtmlElement) -> str:
    return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()

if __name__ == "__main__":
    print(main("041721-001"))
    print(main("080520-001"))