diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py
index f09ad1e..344bd45 100644
--- a/WebCrawler/fanza.py
+++ b/WebCrawler/fanza.py
@@ -9,130 +9,33 @@
 from urllib.parse import urlencode
 from lxml import etree
 from ADC_function import *
-
+from crawler import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
+class fanzaCrawler(Crawler):
+    def getFanzaString(self,string):
+        result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
+        result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
+        return result1+result2
 
-def getTitle(text):
-    html = etree.fromstring(text, etree.HTMLParser())
-    result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
-    return result
+    def getFanzaStrings(self, string):
+        result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
+        if len(result1) > 0:
+            return result1
+        result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
+        return result2
 
-def getActor(text):
-    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    html = etree.fromstring(text, etree.HTMLParser())
-    result = (
-        str(
-            html.xpath(
-                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
-            )
-        )
-        .strip(" ['']")
-        .replace("', '", ",")
-    )
-    return result
+def getRelease(fanza_Crawler):
+    result = fanza_Crawler.getFanzaString('発売日:')
+    if result == '----':
+        result = fanza_Crawler.getFanzaString('配信開始日:')
+    return result.replace("/", "-").strip('\\n')
 
-def getStudio(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getRuntime(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
-    return re.search(r"\d+", str(result)).group()
-
-
-def getLabel(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getNum(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getYear(getRelease):
-    try:
-        result = str(re.search(r"\d{4}", getRelease).group())
-        return result
-    except:
-        return getRelease
-
-
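Note: the per-field helpers removed in this hunk (getTitle, getActor, getStudio, getRuntime, getLabel, getNum, getYear, and getRelease just below) each re-parsed the whole page and ran one hard-coded XPath pair; `fanzaCrawler.getFanzaString` collapses them into a single label-driven lookup. Here is a minimal, self-contained sketch of that lookup; `MiniCrawler` and `SAMPLE` are invented for illustration, and the real `Crawler` base class lives in crawler.py, which is not part of this patch:

```python
# Self-contained sketch of the label-driven lookup that getFanzaString performs.
# MiniCrawler and SAMPLE are invented for illustration only.
from lxml import etree

SAMPLE = """
<table>
  <tr><td>メーカー</td><td><a href="#">SampleStudio</a></td></tr>
  <tr><td>収録時間</td><td>120分</td></tr>
</table>
"""

class MiniCrawler:
    def __init__(self, htmlcode):
        # Assumed behaviour of Crawler.__init__: parse the page once.
        self.html = etree.fromstring(htmlcode, etree.HTMLParser())

    def getFanzaString(self, label):
        # Try the linked cell first, then the plain-text cell; one of the
        # two is normally empty, so concatenating them yields the value.
        linked = str(self.html.xpath(
            "//td[contains(text(),'" + label + "')]/following-sibling::td/a/text()"
        )).strip(" ['']")
        plain = str(self.html.xpath(
            "//td[contains(text(),'" + label + "')]/following-sibling::td/text()"
        )).strip(" ['']")
        return linked + plain

crawler = MiniCrawler(SAMPLE)
print(crawler.getFanzaString('メーカー'))    # -> SampleStudio
print(crawler.getFanzaString('収録時間'))    # -> 120分
```

The `str(...).strip(" ['']")` idiom used throughout the patch flattens a one-element XPath result list into a bare string, which is why the two branches can simply be concatenated.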
-def getRelease(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
-        )[0].lstrip("\n")
-    except:
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
-            )[0].lstrip("\n")
-        except:
-            result = "----"
-    if result == "----":
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
-            )[0].lstrip("\n")
-        except:
-            try:
-                result = html.xpath(
-                    "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
-                )[0].lstrip("\n")
-            except:
-                pass
-    return result.replace("/", "-")
-
-
-def getTag(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath(
-            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
-        )
-        return result
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
-        )
-        return result
-
-
-def getCover(text, number):
-    html = etree.fromstring(text, etree.HTMLParser())
+def getCover(html, number):
     cover_number = number
     try:
         result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
@@ -151,29 +54,11 @@ def getCover(text, number):
     return result
 
-
-def getDirector(text):
-    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getOutline(html):
     try:
-        result = html.xpath(
-            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
-        )[0]
-    except:
-        result = html.xpath(
-            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
-        )[0]
-    return result
-
-
-def getOutline(text):
-    html = etree.fromstring(text, etree.HTMLParser())
-    try:
-        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
-            "\n", ""
-        )
+        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
         if result == "":
-            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
-                "\n", ""
-            )
+            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
     except:
         # (TODO) handle more edge case
         # print(html)
@@ -181,21 +66,6 @@ def getOutline(text):
     return result
 
-
-def getSeries(text):
-    try:
-        html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-        try:
-            result = html.xpath(
-                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
-            )[0]
-        except:
-            result = html.xpath(
-                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
-            )[0]
-        return result
-    except:
-        return ""
-
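Note: the removed single-value helpers (getStudio, getLabel, getNum, ...) fell back from the linked cell to the plain-text cell by catching the IndexError raised by `[0]`; getTag's try/except could never fire, because it returned the result list without indexing it. The new `getFanzaStrings` expresses the same fallback with an explicit length check. A standalone sketch, with invented `SAMPLE` markup:

```python
# Standalone sketch of the list-valued fallback used for 'ジャンル:' (genre)
# tags. The SAMPLE markup is invented for illustration.
from lxml import etree

SAMPLE = """
<table>
  <tr><td>ジャンル:</td><td><a>tagA</a> <a>tagB</a></td></tr>
</table>
"""

def getFanzaStrings(html, label):
    # Prefer the linked values; only if none exist, return the cell's text.
    linked = html.xpath("//td[contains(text(),'" + label + "')]/following-sibling::td/a/text()")
    if len(linked) > 0:
        return linked
    return html.xpath("//td[contains(text(),'" + label + "')]/following-sibling::td/text()")

html = etree.fromstring(SAMPLE, etree.HTMLParser())
print(getFanzaStrings(html, 'ジャンル:'))   # -> ['tagA', 'tagB']
```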
 def getExtrafanart(htmlcode):  # 获取剧照
     html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?</div>')
     html = html_pather.search(htmlcode)
@@ -232,6 +102,7 @@ def main(number):
         "https://www.dmm.co.jp/rental/-/detail/=/cid=",
     ]
     chosen_url = ""
+    fanza_Crawler = ''
 
     for url in fanza_urls:
         chosen_url = url + fanza_search_number
@@ -240,6 +111,7 @@
                 urlencode({"rurl": chosen_url})
             )
         )
+        fanza_Crawler = fanzaCrawler(htmlcode)
         if "404 Not Found" not in htmlcode:
             break
     if "404 Not Found" in htmlcode:
@@ -249,35 +121,34 @@
         # for example, the url will be cid=test012
         # but the hinban on the page is test00012
         # so get the hinban first, and then pass it to following functions
-        fanza_hinban = getNum(htmlcode)
+        fanza_hinban = fanza_Crawler.getFanzaString('品番:')
         out_num = fanza_hinban
         number_lo = number.lower()
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
         if (re.sub('-|_', '', number_lo) == fanza_hinban
                 or number_lo.replace('-', '00') == fanza_hinban
                 or number_lo.replace('-', '') + 'so' == fanza_hinban
         ):
             out_num = number
         data = {
-            "title": getTitle(htmlcode).strip(),
-            "studio": getStudio(htmlcode),
-            "outline": getOutline(htmlcode),
-            "runtime": getRuntime(htmlcode),
-            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
-            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
-            "release": getRelease(htmlcode),
+            "title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
+            "studio": fanza_Crawler.getFanzaString('メーカー'),
+            "outline": getOutline(html),
+            "runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
+            "director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
+            "actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
+            "release": getRelease(fanza_Crawler),
             "number": out_num,
-            "cover": getCover(htmlcode, fanza_hinban),
+            "cover": getCover(html, fanza_hinban),
             "imagecut": 1,
-            "tag": getTag(htmlcode),
+            "tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
             "extrafanart": getExtrafanart(htmlcode),
-            "label": getLabel(htmlcode),
-            "year": getYear(
-                getRelease(htmlcode)
-            ),  # str(re.search('\d{4}',getRelease(a)).group()),
+            "label": fanza_Crawler.getFanzaString('レーベル'),
+            "year": re.findall('\d{4}',getRelease(fanza_Crawler))[0],  # str(re.search('\d{4}',getRelease(a)).group()),
             "actor_photo": "",
             "website": chosen_url,
             "source": "fanza.py",
-            "series": getSeries(htmlcode),
+            "series": fanza_Crawler.getFanzaString('シリーズ:'),
         }
     except:
         data = {
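Note: the reworked main() keeps the old comment's caveat, namely that the cid in the URL (e.g. test012) can differ from the 品番 (hinban) printed on the page (test00012), and now trusts the page value unless the requested number matches one of three known transformations. A standalone sketch of that decision; `pick_output_number` is an invented name for illustration:

```python
# Sketch of the output-number decision made in main() above.
import re

def pick_output_number(requested, page_hinban):
    n = requested.lower()
    if (re.sub('-|_', '', n) == page_hinban                # ABC-123 -> abc123
            or n.replace('-', '00') == page_hinban         # ABC-123 -> abc00123
            or n.replace('-', '') + 'so' == page_hinban):  # cid with 'so' suffix
        return requested       # the page agrees with what was asked for
    return page_hinban         # otherwise trust the page's 品番

print(pick_output_number('TEST-012', 'test00012'))   # -> TEST-012
print(pick_output_number('TEST-012', 'other99999'))  # -> other99999
```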
diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py
index 6885ce5..d5340fa 100644
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -4,58 +4,11 @@
 import re
 from lxml import etree#need install
 import json
 import ADC_function
+from crawler import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
-def getTitle_fc2com(htmlcode): #获取厂商
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
-    return result
-def getActor_fc2com(htmlcode):
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
-        return result
-    except:
-        return ''
-def getStudio_fc2com(htmlcode): #获取厂商
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getNum_fc2com(htmlcode): #获取番号
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getRelease_fc2com(htmlcode2): #
-    html=etree.fromstring(htmlcode2,etree.HTMLParser())
-    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
-    return result
-def getCover_fc2com(htmlcode2): #获取厂商 #
-    html = etree.fromstring(htmlcode2, etree.HTMLParser())
-    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
-    return 'http:' + result
-# def getOutline_fc2com(htmlcode2): #获取番号 #
-#     xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
-#     path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
-#     html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
-#     print('https://adult.contents.fc2.com'+path)
-#     print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
-#     result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
-#     return result
-def getTag_fc2com(lx):
-    result = lx.xpath("//a[@class='tag tagTag']/text()")
-    return result
-def getYear_fc2com(release):
-    try:
-        result = re.search('\d{4}',release).group()
-        return result
-    except:
-        return ''
-
 def getExtrafanart(htmlcode):  # 获取剧照
     html_pather = re.compile(r'
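Note: crawler.py itself is not included in this patch. Judging only from the call sites above (`self.html.xpath` inside fanzaCrawler, and `fanza_Crawler.getString(xpath)` in main), the base class presumably looks roughly like the sketch below; this is an inference, not the actual implementation:

```python
# Hypothetical reconstruction of the Crawler base class, inferred from the
# call sites in this diff only -- the real crawler.py may differ.
from lxml import etree

class Crawler:
    def __init__(self, htmlcode):
        # Parse the page once; subclasses query self.html directly.
        self.html = etree.fromstring(htmlcode, etree.HTMLParser())

    def getString(self, xpath):
        # The patch-wide idiom: flatten a one-element XPath result list
        # into a bare string via str(list).strip(" ['']").
        return str(self.html.xpath(xpath)).strip(" ['']")
```

With a base like that, `fanzaCrawler(htmlcode).getString('//*[starts-with(@id, "title")]/text()')` reproduces what the removed getTitle() returned, and the fc2 helpers removed above would be replaced the same way.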