diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index d467eab..7f9cf19 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -100,6 +100,12 @@ def get_data_from_json(file_number, oCC): sources.insert(0, sources.pop(sources.index("javdb"))) if "xcity" in sources: sources.insert(0, sources.pop(sources.index("xcity"))) + if "madou" in sources: + sources.insert(0, sources.pop(sources.index("madou"))) + elif "madou" in sources and ( + re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number) + ): + sources.insert(0, sources.pop(sources.index("madou"))) # check sources in func_mapping todel = [] diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index c559c8d..6885ce5 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -84,7 +84,7 @@ def getTrailer(htmlcode, number): def main(number): try: number = number.replace('FC2-', '').replace('fc2-', '') - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/') + htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8') actor = getActor_fc2com(htmlcode2) if not actor: actor = '素人' @@ -123,4 +123,5 @@ def main(number): if __name__ == '__main__': print(main('FC2-1787685')) print(main('FC2-2086710')) + print(main('FC2-2182382')) diff --git a/WebCrawler/madou.py b/WebCrawler/madou.py index 01fc19c..fffd08b 100644 --- a/WebCrawler/madou.py +++ b/WebCrawler/madou.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') from bs4 import BeautifulSoup # need install from lxml import etree # need install from pyquery import PyQuery as pq # need install @@ -5,10 +7,8 @@ from ADC_function import * import json import re from lib2to3.pgen2 import parse -import sys from urllib.parse import urlparse, unquote -sys.path.append('../') def getActorPhoto(html): @@ -16,12 +16,10 @@ def getActorPhoto(html): def getTitle(html, number): # 获取标题 - title = str(html.xpath('//h1[@class="article-title"]/text()')[0]) - try: - result = str(re.split(r'[/|/|-]', title)[1]) - return result.strip() - except: - return title.replace(number.upper(), '').strip() + # MD0140-2 / 家有性事EP2 爱在身边-麻豆社 + # MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社 + browser_title = str(html.xpath("/html/head/title/text()")[0]) + return str(re.findall(r'^.*?( / | )(.*)-麻豆社$', browser_title)[0][1]).strip() def getStudio(html): # 获取厂商 已修改 @@ -83,13 +81,15 @@ def getSerise(html): # 获取系列 已修改 return '' -def getTag(html): # 获取标签 - return html.xpath('//div[@class="article-tags"]/a/text()') +def getTag(html, studio): # 获取标签 + x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] def getExtrafanart(html): # 获取剧照 return '' + def cutTags(tags): actors = [] tags = [] @@ -109,13 +109,15 @@ def main(number): html = etree.fromstring(htmlcode, etree.HTMLParser()) url = getUrl(html) - tags = getTag(html) - actor,tags = cutTags(tags); + studio = getStudio(html) + tags = getTag(html, studio) + #actor,tags = cutTags(tags) # 演员在tags中的位置不固定,放弃尝试获取 + actor = '' dic = { # 标题 'title': getTitle(html, number), # 制作商 - 'studio': getStudio(html), + 'studio': studio, # 年份 'year': getYear(html), # 简介 @@ -161,4 +163,8 @@ def main(number): if __name__ == '__main__': - print(main('MD0094')) + print(main('MD0222')) + print(main('MD0140-2')) + print(main('MAD039')) + print(main('JDMY027')) + diff --git a/config.ini b/config.ini index e611762..740b050 100755 --- a/config.ini +++ b/config.ini @@ -41,7 +41,7 @@ max_title_len=50 update_check=1 [priority] -website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club +website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91 [escape] literals=\()/ @@ -116,4 +116,4 @@ sites=33,34 ; 人脸识别 hog:方向梯度直方图(不太准确,速度快) cnn:深度学习模型(准确,需要GPU/CUDA,速度慢) [face] -locations_model=hog \ No newline at end of file +locations_model=hog