diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index d467eab..7f9cf19 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -100,6 +100,12 @@ def get_data_from_json(file_number, oCC): sources.insert(0, sources.pop(sources.index("javdb"))) if "xcity" in sources: sources.insert(0, sources.pop(sources.index("xcity"))) + if "madou" in sources: + sources.insert(0, sources.pop(sources.index("madou"))) + elif "madou" in sources and ( + re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number) + ): + sources.insert(0, sources.pop(sources.index("madou"))) # check sources in func_mapping todel = [] diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index c559c8d..6885ce5 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -84,7 +84,7 @@ def getTrailer(htmlcode, number): def main(number): try: number = number.replace('FC2-', '').replace('fc2-', '') - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/') + htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8') actor = getActor_fc2com(htmlcode2) if not actor: actor = '素人' @@ -123,4 +123,5 @@ def main(number): if __name__ == '__main__': print(main('FC2-1787685')) print(main('FC2-2086710')) + print(main('FC2-2182382')) diff --git a/WebCrawler/madou.py b/WebCrawler/madou.py index 01fc19c..fffd08b 100644 --- a/WebCrawler/madou.py +++ b/WebCrawler/madou.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') from bs4 import BeautifulSoup # need install from lxml import etree # need install from pyquery import PyQuery as pq # need install @@ -5,10 +7,8 @@ from ADC_function import * import json import re from lib2to3.pgen2 import parse -import sys from urllib.parse import urlparse, unquote -sys.path.append('../') def getActorPhoto(html): @@ -16,12 +16,10 @@ def getActorPhoto(html): def getTitle(html, number): # 获取标题 - title = str(html.xpath('//h1[@class="article-title"]/text()')[0]) - try: - result = str(re.split(r'[/|/|-]', title)[1]) - return result.strip() - except: - return title.replace(number.upper(), '').strip() + #