diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 0d88279..c576d3f 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -64,6 +64,9 @@ class Carib(Parser): r.append('https://www.caribbeancom.com' + jpg) return r + def getTrailer(self, htmltree): + return f'https://smovie.caribbeancom.com/sample/movies/{self.number}/1080p.mp4' + def getActorPhoto(self, htmltree): htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()") diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 2706b91..858115a 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -11,8 +11,8 @@ class Fanza(Parser): expr_title = '//*[starts-with(@id, "title")]/text()' expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - expr_cover = './/head/meta[@property="og:image"]/@content' - expr_extrafanart = '//a[@name="sample-image"]/img/@src' + # expr_cover = './/head/meta[@property="og:image"]/@content' + # expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_outline_og = '//head/meta[@property="og:description"]/@content' @@ -119,6 +119,48 @@ class Fanza(Parser): if ret == "----": return '' return ret + + def getCover(self, htmltree): + cover_number = self.number + try: + result = htmltree.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # sometimes fanza rewrites _ as \u005f in the image id + if "_" in cover_number: + cover_number = cover_number.replace("_", r"\u005f") + try: + result = htmltree.xpath('//*[@id="' + cover_number + 
'"]/@href')[0] + except: + # (TODO) handle more edge case + # print(html) + # raise exception here, same behavior as before + # people's major requirement is fetching the picture + raise ValueError("can not find image") + return result + + def getExtrafanart(self, htmltree): + htmltext = re.search(r'
\s*?', self.htmlcode) + if htmltext: + htmltext = htmltext.group() + extrafanart_images = re.findall(r'