增加欧美的刮削判断

2020-12-20 00:42:58 +08:00
parent c94fcd47fa
commit fc4cc4c122
1 changed files with 23 additions and 7 deletions
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -103,10 +103,17 @@ def getCover_small(a, index=0):
            result = 'https:' + result
        return result
    except: # 2020.7.17 Repair Cover Url crawl
-        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
+        try:
-        if not 'https' in result:
+            result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
-            result = 'https:' + result
+            if not 'https' in result:
-        return result
+                result = 'https:' + result
            return result
        except:
            result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
            if not 'https' in result:
                result = 'https:' + result
            return result
 def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
@@ -141,14 +148,23 @@ def main(number):
        # and the first element may not be the one we are looking for
        # iterate all candidates and find the matching one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
-        ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
+        # 记录一下欧美的ids  ['Blacked','Blacked']
-        correct_url = urls[ids.index(number)]
+        if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
            correct_url = urls[0]
        else:
            ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
            correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)
        # no cut image by default
        imagecut = 3
        # If gray image exists ,then replace with normal cover
-        cover_small = getCover_small(query_result, index=ids.index(number))
+                if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
            cover_small = getCover_small(query_result)
        else:
            cover_small = getCover_small(query_result, index=ids.index(number))
        if 'placeholder' in cover_small:
            # replace with normal cover and cut it
            imagecut = 1