# Reviewed post-patch versions of getStudio / getYear from WebCrawler/javdb.py.
# getStudio reads its field with a regex instead of the earlier lxml XPath
# lookup; getYear accepts either labelled page text or a bare release date.

# What may sit between a "label:" and its value: the label's closing
# </strong>, opening tags, &nbsp; entities and whitespace.  Other closing tags
# are never skipped, so an empty field cannot swallow the next field's text.
_VALUE_LEAD_IN = r'(?:</strong>|<[^/][^>]*>|&nbsp;|\s)*'

# The value must start with a real character (not markup, an entity or
# whitespace) and runs to the next tag or end of line.
_STUDIO_RE = re.compile(r'片商\s*[::]' + _VALUE_LEAD_IN + r'([^<&\s][^<\r\n]*)')
_LABELED_YEAR_RE = re.compile(r'日期\s*[::]' + _VALUE_LEAD_IN + r'(\d{4})-')
_LEADING_YEAR_RE = re.compile(r'\s*(\d{4})(?!\d)')


def getStudio(a):
    """Return the studio name that follows the "片商:" label in *a*.

    Args:
        a: Page source (HTML or plain text) to search.

    Returns:
        The first text value after the label with surrounding whitespace
        removed, or "" when *a* is empty, the label is missing, or the
        field has no value.
    """
    if not a:
        return ""
    found = _STUDIO_RE.search(a)
    # The previous pattern ended in a lazy group with nothing after it, so it
    # always captured the empty string; the value is now anchored explicitly.
    return found.group(1).strip() if found else ""


def getYear(getRelease):
    """Return the four-digit year found in *getRelease*.

    Args:
        getRelease: Either text containing a "日期:" label followed by a
            ``YYYY-...`` date, or a bare release string that starts with
            the year (the form the pre-regex implementation handled).

    Returns:
        The year as a 4-character string, or '' when no year can be found.
    """
    if not getRelease:
        return ''
    # Prefer the labelled date; fall back to a leading year so callers that
    # still pass a plain "YYYY-MM-DD" string keep getting a result.
    found = (_LABELED_YEAR_RE.search(getRelease)
             or _LEADING_YEAR_RE.match(getRelease))
    return found.group(1) if found else ''