From 75aedf8601e39fded2404974909ce34d72080d5a Mon Sep 17 00:00:00 2001 From: "Wayne.S.Lui" Date: Tue, 30 Aug 2022 22:46:40 +0800 Subject: [PATCH 1/6] Update fanza.py Update to get trailer, extrafanarts and cover, using the method from older version --- scrapinglib/fanza.py | 54 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 2706b91..0ccf285 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -11,8 +11,8 @@ class Fanza(Parser): expr_title = '//*[starts-with(@id, "title")]/text()' expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - expr_cover = './/head/meta[@property="og:image"]/@content' - expr_extrafanart = '//a[@name="sample-image"]/img/@src' + # expr_cover = './/head/meta[@property="og:image"]/@content' + # expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_outline_og = '//head/meta[@property="og:description"]/@content' @@ -119,6 +119,56 @@ class Fanza(Parser): if ret == "----": return '' return ret + + def getCover(self, htmltree): + cover_number = self.number + try: + result = htmltree.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # sometimes fanza modify _ to \u0005f for image id + if "_" in cover_number: + cover_number = cover_number.replace("_", r"\u005f") + try: + result = htmltree.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # (TODO) handle more edge case + # print(html) + # raise exception here, same behavior as before + # people's major requirement is fetching the picture + raise ValueError("can not find image") + return result + + def getExtrafanart(self, htmltree): + html_parent = re.compile(r'
\s*?') + html = html_parent.search( + self.htmlcode) + if html: + html = html.group() + extrafanart_parent = re.compile(r'[\s\S].*}\s*?') + html = html_parent.search( + self.htmlcode) + if html: + html = html.group() + trailer_parent = re.compile(r'\"contentUrl\":\"(.*?)\"') + trailer_url = trailer_parent.search(html) + if trailer_url: + trailer_url = trailer_url.group(1) + trailer_cuts = trailer_url.rsplit('_', 2) + trailer_url = trailer_cuts[0] + '_mhb_w.mp4' + return trailer_url + return '' def getFanzaString(self, expr): result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']") From 7af659bda701c45b19fa5dfdacb651710d698d0e Mon Sep 17 00:00:00 2001 From: "Wayne.S.Lui" Date: Tue, 30 Aug 2022 22:56:54 +0800 Subject: [PATCH 2/6] Update fanza.py --- scrapinglib/fanza.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 0ccf285..698a888 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -140,8 +140,7 @@ class Fanza(Parser): def getExtrafanart(self, htmltree): html_parent = re.compile(r'
\s*?') - html = html_parent.search( - self.htmlcode) + html = html_parent.search(self.htmlcode) if html: html = html.group() extrafanart_parent = re.compile(r'[\s\S].*}\s*?') - html = html_parent.search( - self.htmlcode) + html = html_parent.search(self.htmlcode) if html: html = html.group() trailer_parent = re.compile(r'\"contentUrl\":\"(.*?)\"') From f848a4ec244425a5a09acd009252709e64bf2595 Mon Sep 17 00:00:00 2001 From: "Wayne.S.Lui" Date: Tue, 30 Aug 2022 22:56:54 +0800 Subject: [PATCH 3/6] Update fanza.py --- scrapinglib/fanza.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 0ccf285..5ccd0b3 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -139,35 +139,24 @@ class Fanza(Parser): return result def getExtrafanart(self, htmltree): - html_parent = re.compile(r'
\s*?') - html = html_parent.search( - self.htmlcode) - if html: - html = html.group() - extrafanart_parent = re.compile(r'\s*?', self.htmlcode).group() + if htmltext: + extrafanart_images = re.findall(r'[\s\S].*}\s*?') - html = html_parent.search( - self.htmlcode) - if html: - html = html.group() - trailer_parent = re.compile(r'\"contentUrl\":\"(.*?)\"') - trailer_url = trailer_parent.search(html) - if trailer_url: - trailer_url = trailer_url.group(1) - trailer_cuts = trailer_url.rsplit('_', 2) - trailer_url = trailer_cuts[0] + '_mhb_w.mp4' - return trailer_url + htmltext = re.search(r'', self.htmlcode).group() + if htmltext: + url = re.search(r'\"contentUrl\":\"(.*?)\"', htmltext).group(1) + if url: + url = url.rsplit('_', 2)[0] + '_mhb_w.mp4' + return url return '' def getFanzaString(self, expr): From 45021672166b750df0431b39dbadf93ab9b7eee1 Mon Sep 17 00:00:00 2001 From: "Wayne.S.Lui" Date: Wed, 31 Aug 2022 14:17:52 +0800 Subject: [PATCH 4/6] Merge remote-tracking branch 'origin/master' # Conflicts: # scrapinglib/fanza.py --- scrapinglib/fanza.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 5ccd0b3..858115a 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -139,8 +139,9 @@ class Fanza(Parser): return result def getExtrafanart(self, htmltree): - htmltext = re.search(r'
\s*?', self.htmlcode).group() + htmltext = re.search(r'
\s*?', self.htmlcode) if htmltext: + htmltext = htmltext.group() extrafanart_images = re.findall(r'[\s\S].*}\s*?', self.htmlcode).group() + htmltext = re.search(r'', self.htmlcode) if htmltext: - url = re.search(r'\"contentUrl\":\"(.*?)\"', htmltext).group(1) + htmltext = htmltext.group() + url = re.search(r'\"contentUrl\":\"(.*?)\"', htmltext) if url: + url = url.group(1) url = url.rsplit('_', 2)[0] + '_mhb_w.mp4' return url return '' From 6cb4be22aec00eb4dcd82481bb0daf38b27e687f Mon Sep 17 00:00:00 2001 From: aedvoan Date: Sun, 4 Sep 2022 23:14:38 +0800 Subject: [PATCH 5/6] Update javbus.py fanbus.us already deco, removed to prevent void retry. --- scrapinglib/javbus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index eb559c0..cf3122e 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -42,7 +42,6 @@ class Javbus(Parser): 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', 'cdnbus.fun', 'dmmbus.fun', 'dmmsee.fun', - 'fanbus.us', 'seedmm.fun', ]) + "/" try: From 2a62b59346fbc2d9250225e7de26ffa968c7f1dd Mon Sep 17 00:00:00 2001 From: "Wayne.Lui" Date: Tue, 6 Sep 2022 12:46:03 +0800 Subject: [PATCH 6/6] Update carib.py to download trailer --- scrapinglib/carib.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 0a561de..567ec57 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -63,6 +63,9 @@ class Carib(Parser): r.append('https://www.caribbeancom.com' + jpg) return r + def getTrailer(self, htmltree): + return f'https://smovie.caribbeancom.com/sample/movies/{self.number}/1080p.mp4' + def getActorPhoto(self, htmltree): htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")