import sys
sys.path.append('../')
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *  # wildcard import supplies get_html, config, translateTag_to_sc used below
import secrets
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(a):
    # page title looks like "ABC-123 Some Title | JavDB ..."; drop the site suffix
    html = etree.fromstring(a, etree.HTMLParser())
    browser_title = str(html.xpath("/html/head/title/text()")[0])
    return browser_title[:browser_title.find(' | JavDB')].strip()


def getActor(a):
    # collect actor names, filtered by the gender configured via ADC_function's config
    html = etree.fromstring(a, etree.HTMLParser())
    actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
    genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
    r = []
    idx = 0
    actor_gendor = config.getInstance().actor_gender()
    if actor_gendor not in ['female', 'male', 'both', 'all']:
        actor_gendor = 'female'
    for act in actors:
        if ((actor_gendor == 'all')
                or (actor_gendor == 'both' and genders[idx] in ['symbol female', 'symbol male'])
                or (actor_gendor == 'female' and genders[idx] == 'symbol female')
                or (actor_gendor == 'male' and genders[idx] == 'symbol male')):
            r.append(act)
        idx = idx + 1
    return r


def getaphoto(url):
    # fetch an actor profile page and pull the avatar URL;
    # assumes javdb renders the avatar as <span class="avatar" style="background-image: url(...)">
    html_page = get_html(url)
    img_prether = re.compile(r'<span class="avatar" style="background-image: url\((.*?)\)')
    img_url = img_prether.findall(html_page)
    return img_url[0] if img_url else ''


def getActorPhoto(html, javdb_site):  # signature inferred from the variables the body uses
    # the 演員 (actor) block of the detail page holds one <a href="/actors/..."> per actor;
    # the tag patterns below assume javdb's detail-page markup
    actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?<span class="value">(.*?)</span>')
    actorall = actorall_prether.findall(html)
    if actorall:
        actoralls = actorall[0]
        actor_prether = re.compile(r'<a href="(.*?)">(.*?)</a>')
        actor = actor_prether.findall(actoralls)
        actor_photo = {}
        for i in actor:
            actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com' + i[0])
        return actor_photo
    else:
        return {}


def getStudio(a):
    # html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    # return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
    # the 片商 (maker) value is a link; assumes it is the first <a> after the label
    patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=".*?">(.*?)</a>')
    pianshang = patherr.findall(a)
    if pianshang:
        result = pianshang[0].strip()
        if len(result):
            return result
    # fall back to the seller (賣家) as the studio
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
    except:
        result = ''
    return result


def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')


def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getNum(a):
    # 番號 is the movie ID, e.g. "ABC-123"
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')


def getYear(getRelease):
    # try:
    #     result = str(re.search('\d{4}', getRelease).group())
    #     return result
    # except:
    #     return getRelease
    # the 日期 (date) value is "YYYY-MM-DD"; keep only the part before the first "-"
    patherr = re.compile(r'<strong>日期\:</strong>\s*?<span class="value">(.*?)\-.*?</span>')
    dates = patherr.findall(getRelease)
    if dates:
        result = dates[0]
    else:
        result = ''
    return result
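
# Usage sketch: how the field parsers above are typically combined on a single
# detail page. `get_html` comes from ADC_function; `_example_parse_detail_page`
# and the placeholder URL are illustrative assumptions, not part of the scraper's
# real interface.
def _example_parse_detail_page(url='https://javdb.com/v/placeholder'):  # hypothetical helper
    htmlcode = get_html(url)
    return {
        'title': getTitle(htmlcode),
        'number': getNum(htmlcode),
        'actor': getActor(htmlcode),
        'studio': getStudio(htmlcode),
        'runtime': getRuntime(htmlcode),
        'label': getLabel(htmlcode),
        'year': getYear(htmlcode),
    }
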
def getRelease(a):
    # html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    # result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    # return str(result1 + result2).strip('+')
    # the 日期 (date) field holds the full release date "YYYY-MM-DD"
    patherr = re.compile(r'<strong>日期\:</strong>\s*?<span class="value">(.*?)</span>')
    dates = patherr.findall(a)
    if dates:
        result = dates[0]
    else:
        result = ''
    return result


def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        total = []
        for i in result:
            try:
                total.append(translateTag_to_sc(i))
            except:
                pass
        return total
    except:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        total = []
        for i in result:
            try:
                total.append(translateTag_to_sc(i))
            except:
                pass
        return total


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results,
    # DO NOT just take the first one, get the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 Repair Cover Url crawl
        try:
            result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
            if 'https' not in result:
                result = 'https:' + result
            return result
        except:
            result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
            if 'https' not in result:
                result = 'https:' + result
            return result


def getTrailer(htmlcode):  # get the trailer (preview video)
    video_pather = re.compile(r'