diff --git a/.gitignore b/.gitignore index 95c3b73..09632a9 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,5 @@ venv.bak/ JAV_output/**/* failed/* .vscode/launch.json + +.idea \ No newline at end of file diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py index 4594987..622f837 100644 --- a/WebCrawler/fanza.py +++ b/WebCrawler/fanza.py @@ -67,7 +67,7 @@ def getOutline(html): def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'
') + html_pather = re.compile(r'
\n') html = html_pather.search(htmlcode) if html: html = html.group() diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index e4e2eb9..4eb95cb 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -124,7 +124,7 @@ def getCover_small(html, index=0): # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number try: - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + result = html.xpath("//*[@class='movie-list h cols-4']/div/a/div[contains(@class, 'cover')]/img/@src")[index] if not 'https' in result: result = 'https:' + result return result @@ -242,12 +242,12 @@ def main(number): # javdb sometime returns multiple results, # and the first elememt maybe not the one we are looking for # iterate all candidates and find the match one - urls = html.xpath('//div[@class="item"]/a[@class="box"]/@href') + urls = html.xpath('//*[@class="movie-list h cols-4"]/div/a/@href') # 记录一下欧美的ids ['Blacked','Blacked'] if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): correct_url = urls[0] else: - ids = html.xpath('//div[@class="item"]/a[@class="box"]/div[@class="video-title"]/strong/text()') + ids = html.xpath('//*[@class="movie-list h cols-4"]/div/a/div[contains(@class, "video-title")]/strong/text()') try: correct_url = urls[ids.index(number)] except: