From 7efc96f27d5344905558b0fb01d62093bad725b3 Mon Sep 17 00:00:00 2001 From: Feng4 Date: Sat, 19 Dec 2020 20:40:41 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=8F=91=E5=B8=83=E6=97=B6?= =?UTF-8?q?=E9=97=B4=E6=8A=93=E5=8F=96=E4=B8=8D=E5=88=B0=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javdb.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index eda8cb6..eac8d7a 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -26,10 +26,17 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img d.update(p) return d def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") +# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") +# return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') + patherr = re.compile(r'片商\:[\s\S]*?(.*?)') + pianshang = patherr.findall(a) + if pianshang: + result = pianshang[0] + else: + result = "" + return result def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") @@ -46,11 +53,18 @@ def getNum(a): result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') def getYear(getRelease): - try: - result = str(re.search('\d{4}', getRelease).group()) - return result - except: - return getRelease +# try: +# result = str(re.search('\d{4}', getRelease).group()) +# return result +# except: +# return getRelease + patherr = re.compile(r'日期\:\s*?.*?(.*?)\-.*?') + dates = patherr.findall(getRelease) + if dates: + result = dates[0] + else: + result = '' + return result def getRelease(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")