import sys sys.path.append('../') import re from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) def getTitle(a): html = etree.fromstring(a, etree.HTMLParser()) result = html.xpath("/html/body/section/div/h2/strong/text()")[0] return result def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').replace('N/A', '').lstrip(',').replace(',', ', ') def getaphoto(url): html_page = get_html(url) img_prether = re.compile(r'演員\:\s*?.*?(.*)\s*?') actorall = actorall_prether.findall(html) if actorall: actoralls = actorall[0] actor_prether = re.compile(r'(.*?)') actor = actor_prether.findall(actoralls) actor_photo = {} for i in actor: actor_photo[i[1]] = getaphoto('https://javdb.com'+i[0]) return actor_photo else: return {} def getStudio(a): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") # return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') patherr = re.compile(r'片商\:[\s\S]*?(.*?)') pianshang = patherr.findall(a) if pianshang: result = pianshang[0] else: result = "" return result def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') def getLabel(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getNum(a): html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') def getYear(getRelease): # try: # result = str(re.search('\d{4}', getRelease).group()) # return result # except: # return getRelease patherr = re.compile(r'日期\:\s*?.*?(.*?)\-.*?') dates = patherr.findall(getRelease) if dates: result = dates[0] else: result = '' return result def getRelease(a): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") # return str(result1 + result2).strip('+') patherr = re.compile(r'日期\:\s*?.*?(.*?)') dates = patherr.findall(a) if dates: result = dates[0] else: result = '' return result def getTag(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') total = [] for i in result: try: total.append(translateTag_to_sc(i)) except: pass return total except: result = html.xpath('//strong[contains(text(),"類別")]/../span/text()') total = [] for i in result: try: total.append(translateTag_to_sc(i)) except: pass return total def getCover_small(a, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: result = 'https:' + result return result except: # 2020.7.17 Repair Cover Url crawl try: result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] if not 'https' in result: result = 'https:' + result return result except: result = html.xpath("//div[@class='item-image']/img/@data-src")[index] if not 'https' in result: result = 'https:' + result return result def getTrailer(htmlcode): # 获取预告片 video_pather = re.compile(r'