diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index eac8d7a..41a4775 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -103,10 +103,17 @@ def getCover_small(a, index=0): result = 'https:' + result return result except: # 2020.7.17 Repair Cover Url crawl - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] - if not 'https' in result: - result = 'https:' + result - return result + try: + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] + if not 'https' in result: + result = 'https:' + result + return result + except: + result = html.xpath("//div[@class='item-image']/img/@data-src")[index] + if not 'https' in result: + result = 'https:' + result + return result + def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) try: @@ -141,14 +148,23 @@ def main(number): # and the first elememt maybe not the one we are looking for # iterate all candidates and find the match one urls = html.xpath('//*[@id="videos"]/div/div/a/@href') - ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') - correct_url = urls[ids.index(number)] + # 记录一下欧美的ids ['Blacked','Blacked'] + if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): + correct_url = urls[0] + else: + ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') + correct_url = urls[ids.index(number)] + detail_page = get_html('https://javdb.com' + correct_url) # no cut image by default imagecut = 3 # If gray image exists ,then replace with normal cover - cover_small = getCover_small(query_result, index=ids.index(number)) + if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): + cover_small = getCover_small(query_result) + else: + cover_small = getCover_small(query_result, index=ids.index(number)) + if 'placeholder' in cover_small: # replace wit normal cover and cut it imagecut = 1 diff --git a/number_parser.py b/number_parser.py index 6e7f7b4..025e2cf 100644 --- a/number_parser.py +++ b/number_parser.py @@ -41,6 +41,11 @@ def get_number(debug,filepath: str) -> str: file_number = re.search(r'\w+-\w+', filename, re.A).group() return file_number else: # 提取不含减号-的番号,FANZA CID + # 欧美番号匹配规则 + oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) + if oumei: + return oumei.group() + try: return str( re.findall(r'(.+?)\.', @@ -61,6 +66,11 @@ def get_number(debug,filepath: str) -> str: file_number = re.search(r'\w+-\w+', filename, re.A).group() return file_number else: # 提取不含减号-的番号,FANZA CID + # 欧美番号匹配规则 + oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) + if oumei: + return oumei.group() + try: return str( re.findall(r'(.+?)\.', @@ -72,4 +82,4 @@ def get_number(debug,filepath: str) -> str: # if __name__ == "__main__": # import doctest -# doctest.testmod(raise_on_error=True) \ No newline at end of file +# doctest.testmod(raise_on_error=True)