Merge pull request #864 from WarpTraveller/master

Update fanza.py
2022-09-02 20:33:09 +08:00
parent c5c55be846 4502167216
commit 2bb50f4a47
1 changed files with 44 additions and 2 deletions
@@ -11,8 +11,8 @@ class Fanza(Parser):
    expr_title = '//*[starts-with(@id, "title")]/text()'
    expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
-    expr_cover = './/head/meta[@property="og:image"]/@content'
+    # expr_cover = './/head/meta[@property="og:image"]/@content'
-    expr_extrafanart = '//a[@name="sample-image"]/img/@src'
+    # expr_extrafanart = '//a[@name="sample-image"]/img/@src'
    expr_outline = "//div[@class='mg-b20 lh4']/text()"
    expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
    expr_outline_og = '//head/meta[@property="og:description"]/@content'
@@ -119,6 +119,48 @@ class Fanza(Parser):
        if ret == "----":
            return ''
        return ret
    def getCover(self, htmltree):
        """Return the cover image link for the current movie number.

        The link is read from the ``href`` attribute of the element whose
        ``id`` equals ``self.number``. If that lookup fails and the number
        contains an underscore, a second lookup is made with every underscore
        replaced by its literal escaped spelling (see the comment below).

        :param htmltree: parsed page; only its ``xpath(expr)`` method is used.
        :return: the first ``href`` value found for a candidate id.
        :raises ValueError: when none of the candidate ids yields a result.
        """
        cover_number = self.number
        candidates = [cover_number]
        # sometimes fanza modify _ to \u005f for image id (the literal
        # six-character text, not the decoded character)
        if "_" in cover_number:
            candidates.append(cover_number.replace("_", r"\u005f"))

        last_error = None
        for candidate in candidates:
            try:
                return htmltree.xpath('//*[@id="' + candidate + '"]/@href')[0]
            except Exception as err:
                # ``Exception`` (not a bare except) so that KeyboardInterrupt
                # and SystemExit are never converted into "can not find image".
                last_error = err

        # TODO: handle more edge cases.
        # Raising here keeps the previous behaviour: fetching the picture is
        # the major requirement, so a missing cover is reported as an error.
        raise ValueError("can not find image") from last_error
    def getExtrafanart(self, htmltree):
        """Collect sample image URLs from the raw page source.

        The ``sample-image-block`` div is located in ``self.htmlcode`` with a
        regular expression, every ``<img ... src="...">`` inside it is
        extracted, and ``jp`` is inserted in front of the last ``-`` of each
        URL (presumably selecting the larger image variant -- confirm against
        the site).

        :param htmltree: unused; the raw ``self.htmlcode`` text is scanned
            instead of the parsed tree.
        :return: list of rewritten image URLs, or ``''`` when the sample block
            or its images are absent.
        """
        block = re.search(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\s*?</div>', self.htmlcode)
        if not block:
            return ''

        extrafanart_images = re.findall(r'<img.*?src=\"(.*?)\"', block.group())
        if not extrafanart_images:
            return ''

        sheet = []
        for img_url in extrafanart_images:
            head, dash, tail = img_url.rpartition('-')
            # A URL without any "-" cannot be rewritten; keep it unchanged
            # instead of failing the whole scrape with an IndexError.
            sheet.append(head + 'jp-' + tail if dash else img_url)
        return sheet
    def getTrailer(self, htmltree):
        """Build the trailer video URL from the page's JSON-LD script block.

        The ``application/ld+json`` script is located in ``self.htmlcode``,
        its ``contentUrl`` value is extracted, and the last two
        ``_``-separated parts of that URL are replaced with the fixed
        ``_mhb_w.mp4`` suffix.

        :param htmltree: unused; the raw ``self.htmlcode`` text is scanned
            instead of the parsed tree.
        :return: the rewritten trailer URL, or ``''`` when the script block or
            its ``contentUrl`` entry cannot be found.
        """
        ld_json = re.search(r'<script type=\"application/ld\+json\">[\s\S].*}\s*?</script>', self.htmlcode)
        if not ld_json:
            return ''

        content_url = re.search(r'\"contentUrl\":\"(.*?)\"', ld_json.group())
        if not content_url:
            return ''

        # Keep everything before the second-to-last underscore, then append
        # the fixed suffix.
        stem = content_url.group(1).rsplit('_', 2)[0]
        return stem + '_mhb_w.mp4'
    def getFanzaString(self, expr):
        result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")