Factor the XPath helper class out of gcolle.py into WebCrawler/crawler.py.

  WebCrawler/crawler.py (new): Crawler parses HTML with lxml's etree.HTML
    and exposes getString / getStrings / getOutline XPath helpers.
  WebCrawler/avsox.py: removes getTitle, getStudio, getRuntime, getLabel,
    getNum, getYear, getRelease, getCover and getSeries; main() now runs
    the XPath queries inline through Crawler.getString.
  WebCrawler/gcolle.py: drops its local Crawler class and imports it from
    crawler instead.

NOTE(review): this patch was recovered from a copy whose line breaks had
been collapsed. Python indentation and the position of blank context lines
were inferred (hunk line counts were used as a check); verify against the
repository before applying.

diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
index 3699aa4..8fb4860 100644
--- a/WebCrawler/avsox.py
+++ b/WebCrawler/avsox.py
@@ -5,6 +5,7 @@ from lxml import etree
 import json
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
+from crawler import *
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
@@ -17,95 +18,64 @@ def getActorPhoto(html):
         p2 = {t: l}
         d.update(p2)
     return d
-def getTitle(html):
-    try:
-        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
-        return result.replace('/', '')
-    except:
-        return ''
+
 def getActor(html):
     a = html.xpath('//a[@class="avatar-box"]')
     d = []
     for i in a:
         d.append(i.find('span').text)
     return d
-def getStudio(html):
-    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
-    return result1
-def getRuntime(html):
-    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
-    return result1
-def getLabel(html):
-    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
-    return result1
-def getNum(html):
-    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
-    return result1
-def getYear(release):
-    try:
-        result = str(re.search('\d{4}',release).group())
-        return result
-    except:
-        return release
-def getRelease(html):
-    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
-    return result1
-def getCover(html):
-    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
-    return result
+
 def getCover_small(html):
     result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
     return result
 def getTag(html):
     x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
     return [i.strip() for i in x[2:]] if len(x) > 2 else []
-def getSeries(html):
-    try:
-        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
-        return result1
-    except:
-        return ''
 
 def main(number):
     html = get_html('https://tellme.pw/avsox')
-    site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
+    site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
     a = get_html(site + '/cn/search/' + number)
-    html = etree.fromstring(a, etree.HTMLParser())
-    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+    html = Crawler(a)
+    result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('-', '_'))
-        html = etree.fromstring(a, etree.HTMLParser())
-        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+        html = Crawler(a)
+        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
         if result1 == '' or result1 == 'null' or result1 == 'None':
             a = get_html(site + '/cn/search/' + number.replace('_', ''))
-            html = etree.fromstring(a, etree.HTMLParser())
-            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+            html = Crawler(a)
+            result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     detail = get_html("https:" + result1)
     lx = etree.fromstring(detail, etree.HTMLParser())
+    avsox_crawler2 = Crawler(a)
+    avsox_crawler = Crawler(detail)
     try:
-        new_number = getNum(lx)
+        new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
         if new_number.upper() != number.upper():
             raise ValueError('number not found')
-        title = getTitle(lx).strip(new_number)
+        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
         dic = {
             'actor': getActor(lx),
             'title': title,
-            'studio': getStudio(lx),
+            'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
             'outline': getStoryline(number, title),
-            'runtime': getRuntime(lx),
+            'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
             'director': '', #
-            'release': getRelease(lx),
+            'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
             'number': new_number,
-            'cover': getCover(lx),
-            'cover_small': getCover_small(html),
+            'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
+            #'cover_small' : getCover_small(html),
+            'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
             'imagecut': 3,
             'tag': getTag(lx),
-            'label': getLabel(lx),
-            'year': getYear(getRelease(lx)),
+            'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
+            'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
             'actor_photo': getActorPhoto(lx),
             'website': "https:" + result1,
             'source': 'avsox.py',
-            'series': getSeries(lx),
+            'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
         }
     except Exception as e:
         if config.getInstance().debug():
diff --git a/WebCrawler/crawler.py b/WebCrawler/crawler.py
new file mode 100644
index 0000000..e6176b6
--- /dev/null
+++ b/WebCrawler/crawler.py
@@ -0,0 +1,28 @@
+from lxml import etree
+
+class Crawler:
+    def __init__(self,htmlcode):
+        self.html = etree.HTML(htmlcode)
+
+    def getString(self,_xpath):
+        if _xpath == "":
+            return ""
+        result = self.html.xpath(_xpath)
+        try:
+            return result[0]
+        except:
+            return ""
+
+    def getStrings(self,_xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return result
+        except:
+            return ""
+
+    def getOutline(self,_xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
\ No newline at end of file
diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py
index 867f3b9..b0d387a 100644
--- a/WebCrawler/gcolle.py
+++ b/WebCrawler/gcolle.py
@@ -1,37 +1,11 @@
 import sys
 sys.path.append('../')
+from crawler import *
 from ADC_function import *
 from lxml import etree
 from requests_html import HTMLSession
 
 
-class Crawler:
-    def __init__(self,htmlcode):
-        self.html = etree.HTML(htmlcode)
-
-    def getString(self,_xpath):
-        if _xpath == "":
-            return ""
-        result = self.html.xpath(_xpath)
-        try:
-            return result[0]
-        except:
-            return ""
-
-    def getStrings(self,_xpath):
-        result = self.html.xpath(_xpath)
-        try:
-            return result
-        except:
-            return ""
-
-    def getOutline(self,_xpath):
-        result = self.html.xpath(_xpath)
-        try:
-            return "\n".join(result)
-        except:
-            return ""
-
 def main(number):
     config_file = config.getInstance()
     browser = HTMLSession()