#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')  # must run before the project-local imports below
from urllib.parse import urlencode
from ADC_function import *
from WebCrawler.crawler import *


class fanzaCrawler(Crawler):
    """Crawler with helpers for reading label/value rows of a page table.

    Both helpers query ``self.html`` through its ``xpath`` method.
    NOTE(review): ``self.html`` is presumably set up by the ``Crawler`` base
    class as a parsed HTML tree -- confirm against WebCrawler.crawler.
    """

    def getFanzaString(self, string):
        """Return the text of the cell(s) following the ``td`` labelled *string*.

        Two lookups are made on the ``td`` siblings that follow the label
        cell: the text of ``a`` children and the cells' own text. Each result
        list is converted with ``str()`` and stripped of the surrounding
        ``[``, ``]``, ``'`` and space characters; the two strings are then
        concatenated (link text first). An empty result list contributes an
        empty string.

        Because the value comes from the ``str()`` of a list, characters such
        as newlines appear in their escaped form (backslash followed by
        ``n``), and multiple matches keep their ``', '`` separators.
        """
        cell = "//td[contains(text(),'" + string + "')]/following-sibling::td"
        link_text = str(self.html.xpath(cell + "/a/text()")).strip(" ['']")
        own_text = str(self.html.xpath(cell + "/text()")).strip(" ['']")
        return link_text + own_text

    def getFanzaStrings(self, string):
        """Return the XPath result for the cell(s) following the label *string*.

        The text of ``a`` children is preferred; when that lookup yields
        nothing, the cells' own text result is returned instead (which may
        itself be empty).
        """
        cell = "//td[contains(text(),'" + string + "')]/following-sibling::td"
        link_texts = self.html.xpath(cell + "/a/text()")
        if link_texts:
            return link_texts
        return self.html.xpath(cell + "/text()")


def getRelease(fanza_Crawler):
    """Return the release date with ``/`` replaced by ``-``.

    The value labelled '発売日:' is used; when it is the placeholder
    ``'----'`` the value labelled '配信開始日:' is used instead.

    Args:
        fanza_Crawler: object providing ``getFanzaString(label)``.
    """
    result = fanza_Crawler.getFanzaString('発売日:')
    if result == '----':
        result = fanza_Crawler.getFanzaString('配信開始日:')
    # getFanzaString returns text taken from str(list), so line breaks show up
    # as a literal backslash followed by "n"; strip('\\n') trims those two
    # characters from both ends of the value.
    return result.replace("/", "-").strip('\\n')


def getCover(html, number):
    """Return the ``href`` of the element whose ``id`` equals *number*.

    If the first lookup fails, every ``_`` in the id is replaced by the
    literal text ``\\u005f`` and the lookup is retried once.

    Args:
        html: object providing ``xpath(expr)`` that returns a sequence.
        number: element id to look for.

    Raises:
        ValueError: if neither lookup yields a result.
    """
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except Exception:
        # Sometimes fanza writes "_" as the literal text \u005f in the image id.
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except Exception:
            # TODO: handle more edge cases.
            # Raising keeps the earlier behaviour: fetching the picture is the
            # main requirement, so a missing cover is reported as an error.
            raise ValueError("can not find image")
    return result


def getOutline(html):
    """Return the outline text with newlines removed, or ``""`` on failure.

    The first text node of ``div.mg-b20.lh4`` is used; when it is empty after
    newline removal, the first text node of a ``p`` inside that ``div`` is
    used instead. Any lookup error results in an empty string.

    Args:
        html: object providing ``xpath(expr)`` that returns a sequence.
    """
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
    except Exception:
        # TODO: handle more edge cases.
        return ""
    return result


# NOTE(review): the definition below continues on the next physical line and is
# incomplete in this view (its regular-expression literals are cut off), so its
# opening is kept here as-is (inline comment translated) and left commented out
# until the rest of the function is restored:
# def getExtrafanart(htmlcode):  # fetch extra fanart stills
#     html_pather = re.compile(r'
\n') html = html_pather.search(htmlcode) if html: html = html.group() extrafanart_pather = re.compile(r'