Merge pull request #720 from lededev/md-1

Give madou priority over javdb
2022-03-15 16:55:53 +08:00
parent c45037e20c b6786ef9d7
commit 1cb4cd37a2
4 changed files with 30 additions and 17 deletions
--- a/WebCrawler/init.py
+++ b/WebCrawler/init.py
@@ -100,6 +100,12 @@ def get_data_from_json(file_number, oCC):
                sources.insert(0, sources.pop(sources.index("javdb")))
            if "xcity" in sources:
                sources.insert(0, sources.pop(sources.index("xcity")))
+            if "madou" in sources:
+                sources.insert(0, sources.pop(sources.index("madou")))
+        elif "madou" in sources and (
+                re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
+        ):
+            sources.insert(0, sources.pop(sources.index("madou")))

    # check sources in func_mapping
    todel = []
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -84,7 +84,7 @@ def getTrailer(htmlcode, number):
 def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
-        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
+        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
        actor = getActor_fc2com(htmlcode2)
        if not actor:
            actor = '素人'
@@ -123,4 +123,5 @@ def main(number):
 if __name__ == '__main__':
    print(main('FC2-1787685'))
    print(main('FC2-2086710'))
+    print(main('FC2-2182382'))

--- a/WebCrawler/madou.py
+++ b/WebCrawler/madou.py
@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 from bs4 import BeautifulSoup  # need install
 from lxml import etree  # need install
 from pyquery import PyQuery as pq  # need install
@@ -5,10 +7,8 @@ from ADC_function import *
 import json
 import re
 from lib2to3.pgen2 import parse
-import sys

 from urllib.parse import urlparse, unquote
-sys.path.append('../')


 def getActorPhoto(html):
@@ -16,12 +16,10 @@ def getActorPhoto(html):


 def getTitle(html, number):  # 获取标题
-    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
-    try:
-        result = str(re.split(r'[/|／|-]', title)[1])
-        return result.strip()
-    except:
-        return title.replace(number.upper(), '').strip()
+    # <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
+    # <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
+    browser_title = str(html.xpath("/html/head/title/text()")[0])
+    return str(re.findall(r'^.*?( / | )(.*)-麻豆社$', browser_title)[0][1]).strip()


 def getStudio(html):  # 获取厂商 已修改
@@ -83,13 +81,15 @@ def getSerise(html):  # 获取系列 已修改
    return ''


-def getTag(html):  # 获取标签
-    return html.xpath('//div[@class="article-tags"]/a/text()')
+def getTag(html, studio):  # 获取标签
+    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+    return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]


 def getExtrafanart(html):  # 获取剧照
    return ''

+
 def cutTags(tags):
    actors = []
    tags = []
@@ -109,13 +109,15 @@ def main(number):

        html = etree.fromstring(htmlcode, etree.HTMLParser())
        url = getUrl(html)
-        tags = getTag(html)
-        actor,tags = cutTags(tags);
+        studio = getStudio(html)
+        tags = getTag(html, studio)
+        #actor,tags = cutTags(tags) # 演员在tags中的位置不固定，放弃尝试获取
+        actor = ''
        dic = {
            # 标题
            'title': getTitle(html, number),
            # 制作商
-            'studio': getStudio(html),
+            'studio': studio,
            # 年份
            'year': getYear(html),
            # 简介
@@ -161,4 +163,8 @@ def main(number):


 if __name__ == '__main__':
-    print(main('MD0094'))
+    print(main('MD0222'))
+    print(main('MD0140-2'))
+    print(main('MAD039'))
+    print(main('JDMY027'))
+
--- a/config.ini
+++ b/config.ini
@@ -41,7 +41,7 @@ max_title_len=50
 update_check=1

 [priority]
-website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club
+website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91

 [escape]
 literals=\()/