diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index b463b3d..0a7de83 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -18,18 +18,38 @@ def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1] result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').replace('N/A', '').lstrip(',').replace(',', ', ') -def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img - a = actor.split(',') - d={} - for i in a: - p={i:''} - d.update(p) - return d + +def getaphoto(url): + html_page = get_html(url) + img_prether = re.compile(r'演員\:\s*?.*?(.*)\s*?') + actorall = actorall_prether.findall(html) + + if actorall: + actoralls = actorall[0] + actor_prether = re.compile(r'(.*?)') + actor = actor_prether.findall(actoralls) + actor_photo = {} + for i in actor: + actor_photo[i[1]] = getaphoto('https://javdb.com'+i[0]) + + return actor_photo + + else: + return {} + def getStudio(a): -# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() -# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") -# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") -# return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') + # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") + # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") + # return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') patherr = re.compile(r'片商\:[\s\S]*?(.*?)') pianshang = patherr.findall(a) if pianshang: @@ -37,6 +57,7 @@ def getStudio(a): else: result = "" return result + def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") @@ -53,11 +74,11 @@ def getNum(a): result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') def getYear(getRelease): -# try: -# result = str(re.search('\d{4}', getRelease).group()) -# return result -# except: -# return getRelease + # try: + # result = str(re.search('\d{4}', getRelease).group()) + # return result + # except: + # return getRelease patherr = re.compile(r'日期\:\s*?.*?(.*?)\-.*?') dates = patherr.findall(getRelease) if dates: @@ -65,11 +86,12 @@ def getYear(getRelease): else: result = '' return result + def getRelease(a): -# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() -# result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") -# result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") -# return str(result1 + result2).strip('+') + # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + # result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") + # result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") + # return str(result1 + result2).strip('+') patherr = re.compile(r'日期\:\s*?.*?(.*?)') dates = patherr.findall(a) if dates: @@ -121,6 +143,30 @@ def getCover_small(a, index=0): result = 'https:' + result return result + +def getTrailer(htmlcode): # 获取预告片 + video_pather = re.compile(r'