From 7af0951b825eb1697f7821c8dc52cb9966ab6bc1 Mon Sep 17 00:00:00 2001
From: Feng4
Date: Sun, 20 Dec 2020 00:34:42 +0800
Subject: [PATCH 1/4] Update number_parser.py

---
 number_parser.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/number_parser.py b/number_parser.py
index 6e7f7b4..554d3d3 100644
--- a/number_parser.py
+++ b/number_parser.py
@@ -61,6 +61,11 @@ def get_number(debug,filepath: str) -> str:
             file_number = re.search(r'\w+-\w+', filename, re.A).group()
             return file_number
         else: # Extract IDs that contain no hyphen (FANZA CID)
+            # Matching rule for European/American (Western) release IDs
+            oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
+            if oumei:
+                return oumei.group()
+
             try:
                 return str(
                     re.findall(r'(.+?)\.',
@@ -72,4 +77,4 @@ def get_number(debug,filepath: str) -> str:
 
 # if __name__ == "__main__":
 #     import doctest
-#     doctest.testmod(raise_on_error=True)
\ No newline at end of file
+#     doctest.testmod(raise_on_error=True)

From c94fcd47facc18942358434d3b25b0a49d065034 Mon Sep 17 00:00:00 2001
From: Feng4
Date: Sun, 20 Dec 2020 00:37:03 +0800
Subject: [PATCH 2/4] Update number_parser.py

---
 number_parser.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/number_parser.py b/number_parser.py
index 554d3d3..025e2cf 100644
--- a/number_parser.py
+++ b/number_parser.py
@@ -41,6 +41,11 @@ def get_number(debug,filepath: str) -> str:
             file_number = re.search(r'\w+-\w+', filename, re.A).group()
             return file_number
         else: # Extract IDs that contain no hyphen (FANZA CID)
+            # Matching rule for European/American (Western) release IDs
+            oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
+            if oumei:
+                return oumei.group()
+
             try:
                 return str(
                     re.findall(r'(.+?)\.',

From fc4cc4c122785da7690f11465a5cd2bc301b0e2f Mon Sep 17 00:00:00 2001
From: Feng4
Date: Sun, 20 Dec 2020 00:42:58 +0800
Subject: [PATCH 3/4] Add scraping detection for European/American releases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebCrawler/javdb.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index eac8d7a..fa38bae 100644
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -103,10 +103,17 @@ def getCover_small(a, index=0):
             result = 'https:' + result
         return result
     except: # 2020.7.17 Repair Cover Url crawl
-        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
-        if not 'https' in result:
-            result = 'https:' + result
-        return result
+        try:
+            result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
+            if not 'https' in result:
+                result = 'https:' + result
+            return result
+        except:
+            result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
+            if not 'https' in result:
+                result = 'https:' + result
+            return result
+
 def getCover(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     try:
@@ -141,14 +148,23 @@ def main(number):
     # and the first element may not be the one we are looking for
     # iterate all candidates and find the match one
     urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
-    ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
-    correct_url = urls[ids.index(number)]
+    # Note: for Western releases the ids list looks like ['Blacked','Blacked']
+    if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
+        correct_url = urls[0]
+    else:
+        ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
+        correct_url = urls[ids.index(number)]
+
     detail_page = get_html('https://javdb.com' + correct_url)
 
     # no cut image by default
     imagecut = 3
     # If gray image exists, then replace with normal cover
-    cover_small = getCover_small(query_result, index=ids.index(number))
+    if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
+        cover_small = getCover_small(query_result)
+    else:
+        cover_small = getCover_small(query_result, index=ids.index(number))
+
     if 'placeholder' in cover_small: # replace with normal cover and cut it
         imagecut = 1
 

From 23281a4a64417207d1f9c31f7bf8a3d1b06aae19 Mon Sep 17 00:00:00 2001
From: Feng4
Date: Sun, 20 Dec 2020 00:49:55 +0800
Subject: [PATCH 4/4] Update javdb.py

---
 WebCrawler/javdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index fa38bae..41a4775 100644
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -160,7 +160,7 @@ def main(number):
     # no cut image by default
     imagecut = 3
     # If gray image exists, then replace with normal cover
-    if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
+    if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
         cover_small = getCover_small(query_result)
     else:
         cover_small = getCover_small(query_result, index=ids.index(number))
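
The number_parser.py patches key Western releases off a dotted studio-plus-date pattern. Below is a minimal sketch, not part of the patches themselves, showing which part of a filename the regex r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}' captures; the sample filenames are hypothetical and only illustrate which styles fall through to the existing hyphen / FANZA CID handling.

import re

# The same pattern the patches add to get_number() and to javdb.py's main().
WESTERN_ID = re.compile(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}')

# Hypothetical filenames, chosen only to illustrate the match behaviour.
for name in ('Blacked.20.12.19.Some.Title.1080p.mp4',  # Western studio.date style
             'ABP-123.mp4',                            # hyphenated ID, handled by the earlier branch
             'ssis00123.mp4'):                         # FANZA CID, handled by the existing try block
    m = WESTERN_ID.search(name)
    print(name, '->', m.group() if m else 'no Western match')
    # -> 'Blacked.20.12.19' for the first name, 'no Western match' for the other two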
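
On the javdb.py side, the result picker cannot rely on the uid column for Western titles: as the patch comment notes, the ids list then holds the studio name (e.g. ['Blacked','Blacked']) rather than the full number, so ids.index(number) would raise ValueError. A sketch of the selection logic the patch adds, using hypothetical urls/ids values:

import re

number = 'Blacked.20.12.19'                        # hypothetical Western-style number
urls = ['/v/hypothetical1', '/v/hypothetical2']    # hypothetical result hrefs
ids = ['Blacked', 'Blacked']                       # shape described in the patch comment

if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
    correct_url = urls[0]                   # uid column is unusable here, take the first hit
else:
    correct_url = urls[ids.index(number)]   # JAV numbers: pick the exact uid match

print(correct_url)                          # -> /v/hypothetical1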