import logging
import sys
sys.path.append('../')
import re
from lxml import etree
import json
from ADC_function import *
from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


# Patterns are compiled once at import time; the literals are unchanged.
_YEAR_RE = re.compile(r'日期\:\s*?.*?(.*?)\-.*?')
_RELEASE_RE = re.compile(r'日期\:\s*?.*?(.*?)')

# Maps the configured actor filter to the gender marker classes it accepts.
_ACTOR_GENDER_CLASSES = {
    'both': ('symbol female', 'symbol male'),
    'female': ('symbol female',),
    'male': ('symbol male',),
}

# Cover thumbnail locations, tried in this order by getCover_small().
_COVER_XPATHS = (
    "//div[@class='item-image fix-scale-cover']/img/@src",
    "//div[@class='item-image fix-scale-cover']/img/@data-src",
    "//div[@class='item-image']/img/@data-src",
)


def _xpath_text(html, expr):
    """Return the result list of ``html.xpath(expr)`` rendered as text.

    The list is stringified and leading/trailing spaces, brackets and single
    quotes are stripped, so an empty result becomes ``''``. Several results
    stay separated by the ``', '`` of the list representation.
    """
    return str(html.xpath(expr)).strip(" ['']")


def _ensure_https(url):
    """Prefix ``https:`` unless the text already contains ``https``."""
    return url if 'https' in url else 'https:' + url


def getTitle(html):
    """Return the page title without the `` | JavDB`` suffix.

    Raises:
        IndexError: if the document has no ``/html/head/title`` text.
    """
    browser_title = str(html.xpath("/html/head/title/text()")[0])
    # partition() keeps the whole title when the suffix is missing; slicing
    # with find() == -1 used to silently drop the last character.
    return browser_title.partition(' | JavDB')[0].strip()


def getActor(html):
    """Return the actor names, filtered by the configured gender setting.

    The setting is read from ``config.getInstance().actor_gender()``; any
    value other than 'female', 'male', 'both' or 'all' is treated as
    'female'. With 'all' every actor is returned without looking at the
    gender markers.
    """
    actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
    genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
    wanted = config.getInstance().actor_gender()
    if wanted not in ('female', 'male', 'both', 'all'):
        wanted = 'female'
    if wanted == 'all':
        return list(actors)
    accepted = _ACTOR_GENDER_CLASSES[wanted]
    # zip() stops at the shorter list, so an actor without a gender marker is
    # skipped instead of raising IndexError.
    return [name for name, gender in zip(actors, genders) if gender in accepted]


# NOTE(review): this definition looks damaged in the visible source. It reads
# `patherr`, `a` and `html`, none of which are bound in this scope, so calling
# it raises NameError once the first line has run. It is probably the start of
# one function fused with the end of another - confirm against the original
# file. The code is kept token-for-token; nothing was reconstructed.
def getaphoto(url, session):
    html_page = session.get(url).text if session is not None else get_html(url)
    img_prether = re.compile(r'片商\:[\s\S]*?(.*?)')
    pianshang = patherr.findall(a)
    if pianshang:
        result = pianshang[0].strip()
        if len(result):
            return result
    # Fall back to the seller as the studio
    try:
        result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
    except:
        result = ''
    return result


def getRuntime(html):
    """Return the text next to the "時長" label, without trailing 'm'/'i' characters."""
    result1 = _xpath_text(html, '//strong[contains(text(),"時長")]/../span/text()')
    result2 = _xpath_text(html, '//strong[contains(text(),"時長")]/../span/a/text()')
    return (result1 + result2).strip('+').rstrip('mi')


def getLabel(html):
    """Return the text next to the "系列" label.

    List separators (``', '``) and double quotes left over from the list
    representation are removed.
    """
    result1 = _xpath_text(html, '//strong[contains(text(),"系列")]/../span/text()')
    result2 = _xpath_text(html, '//strong[contains(text(),"系列")]/../span/a/text()')
    return (result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getNum(html):
    """Return the text next to the "番號" label: the linked part, then the plain part."""
    result1 = _xpath_text(html, '//strong[contains(text(),"番號")]/../span/text()')
    result2 = _xpath_text(html, '//strong[contains(text(),"番號")]/../span/a/text()')
    return (result2 + result1).strip('+')


def getYear(getRelease):
    """Return the text between ``日期:`` and the next ``-`` in *getRelease*.

    Only the first match is used and the captured text is not stripped.
    Returns ``''`` when the pattern does not match.
    """
    dates = _YEAR_RE.findall(getRelease)
    return dates[0] if dates else ''


def getRelease(a):
    """Return the first capture of the release-date pattern in *a*, or ``''``.

    NOTE(review): as written, the only capture group is lazy and nothing
    follows it, so every match captures the empty string and this function
    returns ``''`` for any input. The pattern is probably incomplete -
    confirm against the original file before relying on the result.
    """
    dates = _RELEASE_RE.findall(a)
    return dates[0] if dates else ''


def getTag(html):
    """Return the list of texts next to the "類別" label.

    The linked texts are preferred. The plain span text is used only if the
    first query raises.
    """
    try:
        return html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
    except Exception:
        return html.xpath('//strong[contains(text(),"類別")]/../span/text()')


def getCover_small(html, index=0):
    """Return the small cover URL of the result at position *index*.

    javdb sometimes returns multiple results, so the entry at *index* is
    used rather than always the first one. Three image locations are tried
    in order (the ``data-src`` variants were added in the 2020.7.17 cover
    URL repair). The chosen value gets an ``https:`` prefix unless it
    already contains ``https``.

    Raises:
        IndexError: if none of the locations has an entry at *index*.
    """
    *fallible, last_resort = _COVER_XPATHS
    for expr in fallible:
        try:
            return _ensure_https(html.xpath(expr)[index])
        except Exception:
            # Missing entry or unusable value: try the next location.
            continue
    # The last location is not guarded, so its error reaches the caller.
    return _ensure_https(html.xpath(last_resort)[index])


# NOTE(review): the visible source ends in the middle of getTrailer(), inside
# the raw-string argument of re.compile(). Its opening tokens are preserved
# below as a comment; restore the full definition from the rest of the file.
# def getTrailer(htmlcode):  # fetch the trailer
#     video_pather = re.compile(r'