Merge branch 'upstream'

# Conflicts:
#	WebCrawler/fanza.py
Author: Deng Zhou
Date:   2022-04-29 23:53:21 +08:00
34 changed files with 25968 additions and 18336 deletions

WebCrawler/__init__.py

@@ -24,6 +24,7 @@ from . import carib
from . import fc2club
from . import mv91
from . import madou
from . import gcolle
def get_data_state(data: dict) -> bool:  # detect whether metadata fetching failed
@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
"carib": carib.main,
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main
"madou": madou.main,
"gcolle": gcolle.main,
}
conf = config.getInstance()
@@ -91,6 +93,8 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("fc2")))
if "fc2club" in sources:
sources.insert(0, sources.pop(sources.index("fc2club")))
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources.insert(0, sources.pop(sources.index("gcolle")))
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
@@ -100,6 +104,12 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("javdb")))
if "xcity" in sources:
sources.insert(0, sources.pop(sources.index("xcity")))
if "madou" in sources:
sources.insert(0, sources.pop(sources.index("madou")))
elif "madou" in sources and (
re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources.insert(0, sources.pop(sources.index("madou")))
# check sources in func_mapping
todel = []
@@ -124,7 +134,10 @@ def get_data_from_json(file_number, oCC):
for source in sources:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
result = pool.apply_async(func_mapping[source], (file_number,)).get()
try:
    json_data = json.loads(result)
except (TypeError, ValueError):
    # some crawlers (e.g. gcolle.py) return a dict rather than a JSON string
    json_data = result
# if any service returns a valid result, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -136,7 +149,10 @@ def get_data_from_json(file_number, oCC):
try:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(func_mapping[source](file_number))
result = func_mapping[source](file_number)
try:
    json_data = json.loads(result)
except (TypeError, ValueError):
    # some crawlers (e.g. gcolle.py) return a dict rather than a JSON string
    json_data = result
# if any service returns a valid result, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -242,8 +258,8 @@ def get_data_from_json(file_number, oCC):
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.load(
open(str(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json'), 'r', encoding="utf-8"))
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue
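
Note: the tolerant parse above exists because newer crawlers (gcolle.py below) return a dict directly, while the rest return JSON strings. A minimal sketch of the pattern, with load_crawler_result as a hypothetical helper name:

    import json

    def load_crawler_result(raw):
        """Accept either a JSON string or an already-decoded dict."""
        if isinstance(raw, dict):
            return raw
        return json.loads(raw)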

WebCrawler/avsox.py

@@ -5,6 +5,7 @@ from lxml import etree
import json
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -17,95 +18,64 @@ def getActorPhoto(html):
p2 = {t: l}
d.update(p2)
return d
def getTitle(html):
try:
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.find('span').text)
return d
def getStudio(html):
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
def getRuntime(html):
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
def getLabel(html):
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
def getNum(html):
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
try:
result = str(re.search('\d{4}',release).group())
return result
except:
return release
def getRelease(html):
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
def getCover(html):
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x[2:]] if len(x) > 2 else []
def getSeries(html):
try:
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
return ''
def main(number):
html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
a = get_html(site + '/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
avsox_crawler2 = Crawler(a)
avsox_crawler = Crawler(detail)
try:
new_number = getNum(lx)
new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = getTitle(lx).strip(new_number)
title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
dic = {
'actor': getActor(lx),
'title': title,
'studio': getStudio(lx),
'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
'outline': getStoryline(number, title),
'runtime': getRuntime(lx),
'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
'director': '', #
'release': getRelease(lx),
'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
'number': new_number,
'cover': getCover(lx),
'cover_small': getCover_small(html),
'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
#'cover_small' : getCover_small(html),
'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
'imagecut': 3,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)),
'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
'year': re.findall(r'\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': getSeries(lx),
'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
}
except Exception as e:
if config.getInstance().debug():

WebCrawler/carib.py

@@ -40,6 +40,7 @@ def main(number: str) -> json:
'website': f'{G_SITE}/moviepages/{number}/index.html',
'source': 'carib.py',
'series': get_series(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
@@ -59,7 +60,7 @@ def get_year(lx: html.HtmlElement) -> str:
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
g = getStoryline(number, title, 无码=True)
if len(g):
return g
return o

WebCrawler/crawler.py (new file)

@@ -0,0 +1,28 @@
from lxml import etree
class Crawler:
    """Thin lxml wrapper: a missing node yields '' instead of raising."""
    def __init__(self, htmlcode):
        self.html = etree.HTML(htmlcode)
    def getString(self, _xpath):
        # first matching node as a string, or '' when nothing matches
        if _xpath == "":
            return ""
        result = self.html.xpath(_xpath)
        try:
            return result[0]
        except IndexError:
            return ""
    def getStrings(self, _xpath):
        # all matching nodes as a list
        return self.html.xpath(_xpath)
    def getOutline(self, _xpath):
        # matching text nodes joined with newlines (multi-paragraph outlines)
        result = self.html.xpath(_xpath)
        try:
            return "\n".join(result)
        except TypeError:
            return ""

WebCrawler/dlsite.py

@@ -1,15 +1,14 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
#title //*[@id="work_name"]/a/text()
#print(get_html('https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html'))
#title /html/head/title/text()
#studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
#release //th[contains(text(),"販売日")]/../td/a/text()
#story //th[contains(text(),"シナリオ")]/../td/a/text()
@@ -18,14 +17,14 @@ from ADC_function import *
#jianjie //*[@id="main_inner"]/div[3]/text()
#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src
#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html
#https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath('//*[@id="work_name"]/a/text()')[0]
def getTitle(html):
result = str(html.xpath('/html/head/title/text()')[0])
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except:
@@ -38,8 +37,7 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
p={i:''}
d.update(p)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getStudio(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -53,8 +51,7 @@ def getRuntime(a):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -69,12 +66,10 @@ def getYear(getRelease):
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRelease(html):
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
return result1.replace('年', '-').replace('月', '-').replace('日', '')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getTag(html):
try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result
@@ -96,26 +91,22 @@ def getCover_small(a, index=0):
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getCover(html):
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
return result.replace('.webp', '.jpg')
def getDirector(html):
try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except:
result = ''
return result
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getOutline(html):
total = []
result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getSeries(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -127,28 +118,28 @@ def getSeries(a):
def main(number):
try:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN',
cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'actor': getActor(html),
'title': getTitle(html),
'studio': getStudio(html),
'outline': getOutline(html),
'runtime': '',
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'director': getDirector(html),
'release': getRelease(html),
'number': number,
'cover': 'https:' + getCover(htmlcode),
'cover': 'https:' + getCover(html),
'cover_small': '',
'imagecut': 0,
'tag': getTag(htmlcode),
'label': getLabel(htmlcode),
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
'tag': getTag(html),
'label': getLabel(html),
'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(htmlcode),
'series': getSeries(html),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -166,4 +157,6 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('VJ013178'))
print(main('RJ329607'))
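
The getCover() change tracks DLsite's switch to <picture> elements serving WebP: the crawler now reads the srcset and swaps the extension back to .jpg. A hedged one-liner with a made-up srcset value:

    srcset = '//img.dlsite.jp/example/RJ329607_img_main.webp'  # hypothetical path
    cover = 'https:' + srcset.replace('.webp', '.jpg')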

WebCrawler/fanza.py

@@ -9,130 +9,33 @@ from urllib.parse import urlencode
from lxml import etree
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getTitle(text):
html = etree.fromstring(text, etree.HTMLParser())
result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
return result
def getFanzaStrings(self, string):
result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
def getActor(text):
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(text, etree.HTMLParser())
result = (
str(
html.xpath(
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
)
)
.strip(" ['']")
.replace("', '", ",")
)
return result
def getRelease(fanza_Crawler):
result = fanza_Crawler.getFanzaString('発売日:')
if result == '----':
result = fanza_Crawler.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getStudio(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
)[0]
return result
def getRuntime(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
return re.search(r"\d+", str(result)).group()
def getLabel(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
)[0]
return result
def getNum(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
)[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
def getRelease(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
result = "----"
if result == "----":
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
pass
return result.replace("/", "-")
def getTag(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
)
return result
except:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
)
return result
def getCover(text, number):
html = etree.fromstring(text, etree.HTMLParser())
def getCover(html, number):
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
@@ -151,29 +54,11 @@ def getCover(text, number):
return result
def getDirector(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getOutline(html):
try:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
)[0]
return result
def getOutline(text):
html = etree.fromstring(text, etree.HTMLParser())
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
except:
# (TODO) handle more edge case
# print(html)
@@ -181,23 +66,8 @@ def getOutline(text):
return result
def getSeries(text):
try:
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
)[0]
return result
except:
return ""
def getExtrafanart(htmlcode):  # extract the sample stills
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div></div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
@@ -232,6 +102,7 @@ def main(number):
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls:
chosen_url = url + fanza_search_number
@@ -240,6 +111,7 @@ def main(number):
urlencode({"rurl": chosen_url})
)
)
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
@@ -249,28 +121,34 @@ def main(number):
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to the following functions
fanza_hinban = getNum(htmlcode)
fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
data = {
"title": getTitle(htmlcode).strip(),
"studio": getStudio(htmlcode),
"outline": getOutline(htmlcode),
"runtime": getRuntime(htmlcode),
"director": getDirector(htmlcode) if "anime" not in chosen_url else "",
"actor": getActor(htmlcode) if "anime" not in chosen_url else "",
"release": getRelease(htmlcode),
"number": fanza_hinban,
"cover": getCover(htmlcode, fanza_hinban),
"title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(html),
"runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
"actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
"release": getRelease(fanza_Crawler),
"number": out_num,
"cover": getCover(html, fanza_hinban),
"imagecut": 1,
"tag": getTag(htmlcode),
"tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode),
"label": getLabel(htmlcode),
"year": getYear(
getRelease(htmlcode)
), # str(re.search('\d{4}',getRelease(a)).group()),
"label": fanza_Crawler.getFanzaString('レーベル'),
"year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
"series": getSeries(htmlcode),
"series": fanza_Crawler.getFanzaString('シリーズ:'),
}
except:
data = {
@@ -314,4 +192,6 @@ def main_htmlcode(number):
if __name__ == "__main__":
# print(main("DV-1562"))
# print(main("96fad1217"))
print(main("h_173ghmt68"))
print(main("pred00251"))
print(main("MIAA-391"))
print(main("OBA-326"))

WebCrawler/fc2.py

@@ -4,58 +4,11 @@ import re
from lxml import etree#need install
import json
import ADC_function
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle_fc2com(htmlcode):  # get the title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
return result
except:
return ''
def getStudio_fc2com(htmlcode):  # get the studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode):  # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease_fc2com(htmlcode2): #
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
return result
def getCover_fc2com(htmlcode2):  # get the cover
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
return 'http:' + result
# def getOutline_fc2com(htmlcode2): #获取番号 #
# xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
# path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
# html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
# print('https://adult.contents.fc2.com'+path)
# print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
# result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
# return result
def getTag_fc2com(lx):
result = lx.xpath("//a[@class='tag tagTag']/text()")
return result
def getYear_fc2com(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def getExtrafanart(htmlcode):  # extract the sample stills
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode)
@@ -79,27 +32,30 @@ def getTrailer(htmlcode, number):
except:
return ''
else:
video_url = ''
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
actor = getActor_fc2com(htmlcode2)
if not actor:
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
fc2_crawler = Crawler(htmlcode2)
actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = {
'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': re.findall(r'\d{4}', release)[0],
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': getStudio_fc2com(htmlcode2),
'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor,
'release': getRelease_fc2com(htmlcode2),
'release': release,
'number': 'FC2-' + number,
'label': '',
'cover': cover,
@@ -107,7 +63,7 @@ def main(number):
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number),
'imagecut': 0,
'tag': getTag_fc2com(lx),
'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/',
@@ -121,6 +77,4 @@ def main(number):
return js
if __name__ == '__main__':
print(main('FC2-1787685'))
print(main('FC2-2086710'))
print(main('FC2-2182382'))

WebCrawler/gcolle.py (new file)

@@ -0,0 +1,88 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
from lxml import etree
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cookies, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cookies)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
if cookies_filepath and Path(cookies_filepath).is_file():
    Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))
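
gcolle.py passes the R18 age gate once per session, then persists the cookies so later runs skip the gate. The save logic above, restated as a hedged standalone helper:

    import json
    from pathlib import Path

    def save_session_cookies(session, cookie_filename='gcolle.json'):
        cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
        cookies_save.parent.mkdir(parents=True, exist_ok=True)
        cookies_save.write_text(json.dumps(session.cookies.get_dict(),
                                           sort_keys=True, indent=4), encoding='utf-8')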


@@ -56,9 +56,9 @@ def parse_info(soup: BeautifulSoup) -> dict:
"label": get_label(data_dic),
"studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic),
"number": get_number(data_dic).upper(),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic),
"runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic),
}
else:

WebCrawler/javbus.py

@@ -60,10 +60,10 @@ def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number, title):  # fetch the storyline via concurrent multi-site queries
def getOutline(number, title, uncensored):  # fetch the storyline via concurrent multi-site queries
    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
        return ''  # calls from airav.py skip the outline, avoiding duplicate scraping that slows processing
return getStoryline(number,title)
return getStoryline(number,title, 无码=uncensored)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
@@ -83,9 +83,13 @@ def getExtrafanart(htmlcode): # 获取剧照
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number)
w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -94,7 +98,7 @@ def main_uncensored(number):
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
@@ -106,9 +110,10 @@ def main_uncensored(number):
'label': getSeriseJa(lx),
'imagecut': 0,
# 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number,
'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py',
'series': getSeriseJa(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -136,7 +141,7 @@ def main(number):
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
@@ -151,6 +156,7 @@ def main(number):
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(lx),
'无码': getUncensored(lx)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -168,13 +174,14 @@ def main(number):
return js
if __name__ == "__main__" :
config.G_conf_override['debug_mode:switch'] = True
print(main('ABP-888'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
config.getInstance().set_override("debug_mode:switch=1")
# print(main('ABP-888'))
# print(main('ABP-960'))
# print(main('ADV-R0624')) # 404
# print(main('MMNT-010'))
# print(main('ipx-292'))
# print(main('CEMD-011'))
# print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001'))
print(main('AVSW-061'))
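
main_uncensored() now targets javbus.red and normalizes western release numbers by mapping dots to hyphens before building the URL; a one-line restatement:

    w_number = 'BrazzersExxtra.21.02.01'.replace('.', '-')
    url = 'https://www.javbus.red/' + w_number  # -> https://www.javbus.red/BrazzersExxtra-21-02-01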

WebCrawler/javdb.py

@@ -166,12 +166,23 @@ def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title):  # fetch the storyline via concurrent multi-site queries
    return getStoryline(number,title)
def getOutline(number, title, uncensored):  # fetch the storyline via concurrent multi-site queries
    return getStoryline(number, title, 无码=uncensored)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
def main(number):
# after the javdb update, only one numbered mirror can be logged in at a time (a new login kicks out the old one), so the site is chosen from the first javdb*.json cookie file found
@@ -276,7 +287,7 @@ def main(number):
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
@@ -293,8 +304,12 @@ def main(number):
'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py',
'series': getSeries(lx),
'无码': getUncensored(lx)
}
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
@@ -313,18 +328,19 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30'))
# print(main('AGAV-042'))
# print(main('BANK-022'))
# print(main('070116-197'))
print(main('070116-197'))
# print(main('093021_539')) # 没有剧照 片商pacopacomama
#print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
#print(main('MVSD-439'))
# print(main('EHM0001')) # not found
print(main('FC2-2314275'))
#print(main('FC2-2314275'))
# print(main('EBOD-646'))
# print(main('LOVE-262'))
#print(main('ABP-890'))
print(main('ABP-890'))
print(main('blacked.14.12.08'))
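
getUserRating() parses javdb's score line of the form '4.5分, 由123人評價' into a (score, votes) tuple; an equivalent check with a slightly tightened pattern:

    import re

    text = '4.5分, 由123人評價'  # sample score line
    v = re.findall(r'(\d+(?:\.\d+)?)分, 由(\d+)人評價', text)
    print(float(v[0][0]), int(v[0][1]))  # -> 4.5 123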

WebCrawler/madou.py

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup # need install
from lxml import etree # need install
from pyquery import PyQuery as pq # need install
@@ -5,24 +7,22 @@ from ADC_function import *
import json
import re
from lib2to3.pgen2 import parse
import sys
from urllib.parse import urlparse, unquote
sys.path.append('../')
def getActorPhoto(html):
return ''
def getTitle(html, number):  # get the title
title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
try:
result = str(re.split(r'[/||-]', title)[1])
return result.strip()
except:
return title.replace(number.upper(), '').strip()
def getTitle(html):  # get the title
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(html.xpath("/html/head/title/text()")[0])
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getStudio(html):  # get the studio (revised)
try:
@@ -61,7 +61,6 @@ def getNum(url, number): # 获取番号
filename = unquote(urlparse(url).path)
# trim the filename
result = filename[1:-5].upper().strip()
print(result)
# strip the Chinese characters
if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
@@ -83,13 +82,15 @@ def getSerise(html): # 获取系列 已修改
return ''
def getTag(html):  # get the tags
return html.xpath('//div[@class="article-tags"]/a/text()')
def getTag(html, studio):  # get the tags
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html):  # extract the sample stills
return ''
def cutTags(tags):
actors = []
tags = []
@@ -109,13 +110,15 @@ def main(number):
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
tags = getTag(html)
actor,tags = cutTags(tags);
studio = getStudio(html)
tags = getTag(html, studio)
#actor,tags = cutTags(tags)  # the actor's position among the tags is not fixed, so extraction was abandoned
actor = ''
dic = {
# title
'title': getTitle(html, number),
'title': getTitle(html),
# studio
'studio': getStudio(html),
'studio': studio,
# year
'year': getYear(html),
# outline
@@ -143,7 +146,8 @@ def main(number):
'website': url,
'source': 'madou.py',
# series
'series': getSerise(html)
'series': getSerise(html),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -161,4 +165,11 @@ def main(number):
if __name__ == '__main__':
print(main('MD0094'))
config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))
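
The new getTitle() derives the title from the browser <title>, stripping the leading product number and the trailing -麻豆社 site suffix (see the sample titles in the comments above):

    import re

    browser_title = 'MD0140-2 / 家有性事EP2 爱在身边-麻豆社'
    title = re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0].strip()
    print(title)  # -> '家有性事EP2 爱在身边'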

WebCrawler/mgstage.py

@@ -5,95 +5,28 @@ from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
return result.replace('/', ',')
except:
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}',getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace('/','-')
class MgsCrawler(Crawler):
def getMgsString(self, _xpath):
html = self.html
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="EnlargeImage"]/@href')).strip(" ['']")
# result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getExtrafanart(htmlcode):  # extract the sample stills
def getExtrafanart(htmlcode2):  # extract the sample stills
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode)
html = html_pather.search(htmlcode2)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
@@ -104,36 +37,35 @@ def getExtrafanart(htmlcode): # 获取剧照
def main(number2):
number=number2.upper()
htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode, 'lxml')
a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode2, 'lxml')
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b)
try:
dic = {
'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
'studio': getStudio(a),
'outline': getOutline(b),
'runtime': getRuntime(a),
'director': getDirector(a),
'actor': getActor(a),
'release': getRelease(a),
'number': getNum(a),
'cover': getCover(htmlcode),
'imagecut': 1,
'tag': getTag(a),
'label': getLabel(a),
'extrafanart': getExtrafanart(htmlcode),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': getSeries(a),
}
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
dic = {
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'imagecut': 1,
'tag': getTag(a2),
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'extrafanart': getExtrafanart(htmlcode2),
'year': str(re.findall(r'\d{4}', a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
# str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
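
MgsCrawler.getMgsString() evaluates the given xpath plus a variant with 'td/a/' relaxed to 'td/', so a spec-table value is found whether or not it is wrapped in a link. A stripped-down restatement (assuming the <th>label</th><td>value</td> row layout):

    from lxml import etree

    class MiniMgs:
        def __init__(self, htmlcode):
            self.html = etree.HTML(htmlcode)
        def get(self, _xpath):
            linked = self.html.xpath(_xpath)                         # value inside <a>
            plain = self.html.xpath(_xpath.replace('td/a/', 'td/'))  # bare cell text
            return ''.join(linked + plain).strip()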

WebCrawler/storyline.py

@@ -5,7 +5,6 @@ import json
import builtins
from ADC_function import *
from lxml.html import fromstring
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
@@ -13,7 +12,7 @@ from number_parser import is_uncensored
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池')
G_mode_txt = ('顺序执行','线程池')
class noThread(object):
def map(self, fn, param):
@@ -25,14 +24,15 @@ class noThread(object):
# fetch the storyline: query the listed sites concurrently; earlier sites take precedence
def getStoryline(number, title, sites: list=None):
def getStoryline(number, title, sites: list=None, 无码=None):
start_time = time.time()
conf = config.getInstance()
if not conf.is_storyline():
return ''
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number):
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
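
The explicit 无码 flag lets callers (carib.py, javbus.py, javdb.py, madou.py above) override number-based detection when picking storyline sites; restated as a hedged helper:

    from number_parser import is_uncensored

    def storyline_sites(conf, number, 无码=None):
        sites = conf.storyline_site().split(',')
        unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
        if unc:
            sites += conf.storyline_uncensored_site().split(',')
        else:
            sites += conf.storyline_censored_site().split(',')
        return sites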
@@ -49,9 +49,8 @@ def getStoryline(number, title, sites: list=None):
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = conf.storyline_mode()
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
run_mode = 1 if conf.storyline_mode() > 0 else 0
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
@@ -62,7 +61,7 @@ def getStoryline(number, title, sites: list=None):
if not len(sel):
sel = value
return sel
# the debug output below is written to the log; output from pool processes is not, and shows only on stdout
# the debug output below is written to the log
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
for site, desc in zip(apply_sites, results):
@@ -80,34 +79,33 @@ def getStoryline(number, title, sites: list=None):
def getStoryline_mp(args):
def _inner(site, number, title, debug):
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
# in process-pool mode, print() from the getStoryline_*() children is not written to the log; thread-pool and sequential modes are unaffected
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
(site, number, title, debug) = args
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
return _inner(*args)
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
@@ -308,8 +306,8 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
@@ -325,8 +323,9 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
div = lx.xpath('//*[@id="productDescription"]')[0]
ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1 + p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None:
@@ -406,10 +405,10 @@ def amazon_select_one(a_titles, q_title, number, debug):
# in debug mode, log the match-accuracy details
if ratio < 0.9:
    # rejected results with similarity in [0.5, 0.9) are logged separately
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# log the accepted result
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel


@@ -128,7 +128,7 @@ def getOutline(html, number, title):
a = set(storyline_site) & {'airav', 'avno1'}  # only these return Chinese outline text
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
g = getStoryline(number, title, site, 无码=False)
if len(g):
return g
try: