From b88b2ead7ee8b005ab8e80088aa8822caa14c95e Mon Sep 17 00:00:00 2001 From: xingfan_xia Date: Thu, 3 Jun 2021 12:29:19 -0700 Subject: [PATCH] add airav outline to javdb crawler --- WebCrawler/javbus.py | 2 +- WebCrawler/javdb.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 50bef20..0082521 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -82,7 +82,7 @@ def getCID(htmlcode): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline(number): #获取演员 +def getOutline(number): #获取剧情介绍 try: response = json.loads(airav.main(number)) result = response['outline'] diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 1aadfca..a70e59a 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -192,10 +192,13 @@ def getDirector(a): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") - return result +def getOutline(number): #获取剧情介绍 + try: + response = json.loads(airav.main(number)) + result = response['outline'] + return result + except: + return '' def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -281,7 +284,7 @@ def main(number): 'actor': getActor(detail_page), 'title': title, 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), + 'outline': getOutline(number), 'runtime': getRuntime(detail_page), 'director': getDirector(detail_page), 'release': getRelease(detail_page),