Update Pre-release 3.7
AV_Data_Capture.py
@@ -33,7 +33,7 @@ def movie_lists(root, escape_folder):
     if folder in root:
         return []
     total = []
-    file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV', '.MOV', '.MKV', '.FLV', '.TS', '.WEBM', ]
+    file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV', '.MOV', '.MKV', '.FLV', '.TS', '.WEBM', '.iso', '.ISO']
     dirs = os.listdir(root)
     for entry in dirs:
         f = os.path.join(root, entry)
@@ -110,7 +110,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
 
 
 if __name__ == '__main__':
-    version = '3.6'
+    version = '3.7'
 
     # Parse command line args
     single_file_path, config_file, auto_exit, custom_number = argparse_function()
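A note on the touched list: file_type carries every extension twice, once per case, and this change extends the pattern with '.iso'/'.ISO'. A case-insensitive check would make the duplication unnecessary; a minimal sketch (the helper name is hypothetical and not part of this commit):

# Sketch: match extensions case-insensitively instead of listing both cases.
FILE_TYPES = ('.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv',
              '.flv', '.ts', '.webm', '.iso')

def is_movie_file(entry: str) -> bool:
    # casefold() lets '.ISO', '.Iso' and '.iso' all match the same suffix
    return entry.casefold().endswith(FILE_TYPES)

assert is_movie_file('movie.ISO') and not is_movie_file('notes.txt')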
WebCrawler/avsox.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = {}
    for i in a:
        l = i.img['src']
        t = i.span.get_text()
        p2 = {t: l}
        d.update(p2)
    return d

def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except:
        return ''

def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = []
    for i in a:
        d.append(i.span.get_text())
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '", ' ')
    return result1

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
    return result1

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
    return result1

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
    return result1

def getYear(release):
    try:
        result = str(re.search('\d{4}', release).group())
        return result
    except:
        return release

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
    return result1

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
    return result

def getCover_small(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result

def getTag(a):  # get tags (the original comment said "actors"; this parses the genre boxes)
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    d = []
    for i in a:
        d.append(i.get_text())
    return d

def getSeries(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
        return result1
    except:
        return ''


def main(number):
    a = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # retry the search with '-' replaced by '_', then with '_' stripped
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            print(a)
            html = etree.fromstring(a, etree.HTMLParser())
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',
        'runtime': getRuntime(info),
        'director': '',
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
        'series': getSeries(info),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == "__main__":
    print(main('012717_472'))
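The cascading re-search in main() above retries the query with '-' swapped for '_' and then with '_' stripped. The same logic reads more compactly as a loop over candidate spellings; a sketch under the same get_html/etree assumptions, not part of the committed file:

def search_href(number):
    # try the number as given, then '-' -> '_', then with '_' removed
    for candidate in (number, number.replace('-', '_'), number.replace('_', '')):
        page = get_html('https://avsox.host/cn/search/' + candidate)
        html = etree.fromstring(page, etree.HTMLParser())
        hrefs = html.xpath('//*[@id="waterfall"]/div/a/@href')
        if hrefs:
            return hrefs[0]
    return ''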
WebCrawler/dlsite.py (new file, 131 lines)
@@ -0,0 +1,131 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

# print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
# XPath notes:
# title    //*[@id="work_name"]/a/text()
# studio   //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
# release  //th[contains(text(),"販売日")]/../td/a/text()
# story    //th[contains(text(),"シナリオ")]/../td/a/text()
# voice    //th[contains(text(),"声優")]/../td/a/text()
# tag      //th[contains(text(),"ジャンル")]/../td/div/a/text()
# outline  //*[@id="main_inner"]/div[3]/text()
# photo    //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src

# https://www.dlsite.com/pro/work/=/product_id/VJ013152.html


def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//*[@id="work_name"]/a/text()')[0]
    return result

def getActor(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
    return result1

def getActorPhoto(actor):
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
    return result

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
    return result

def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0]
    return result1.replace('年', '-').replace('月', '-').replace('日', '')

def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()')
        return result
    except:
        return ''


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results;
    # DO NOT just take the first one, take the one with the correct index
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
    return result

def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0]
    return result

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    total = []
    result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
    for i in result:
        total.append(i.strip('\r\n'))
    return str(total).strip(" ['']").replace("', '', '", r'\n').replace("', '", r'\n').strip(", '', '")

def getSeries(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
    return result1

def main(number):
    number = number.upper()
    htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')

    dic = {
        'actor': getActor(htmlcode),
        'title': getTitle(htmlcode),
        'studio': getStudio(htmlcode),
        'outline': getOutline(htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'release': getRelease(htmlcode),
        'number': number,
        'cover': 'https:' + getCover(htmlcode),
        'cover_small': '',
        'imagecut': 0,
        'tag': getTag(htmlcode),
        'label': getLabel(htmlcode),
        'year': getYear(getRelease(htmlcode)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
        'source': 'dlsite.py',
        'series': getSeries(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.\n")
if __name__ == "__main__":
    print(main('VJ013479'))
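One caveat in getRuntime() above: str.rstrip('mi') removes any run of trailing 'm' and 'i' characters rather than the literal suffix, so a value that legitimately ends in those letters would be clipped too. A safer variant, shown only as a sketch with a hypothetical helper name:

def strip_runtime_suffix(runtime: str) -> str:
    # remove an explicit unit suffix instead of relying on rstrip's
    # character-set semantics
    for suffix in ('min', 'mi', 'm'):
        if runtime.endswith(suffix):
            return runtime[:-len(suffix)]
    return runtime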
WebCrawler/fanza.py (new file, 297 lines)
@@ -0,0 +1,297 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import re
from urllib.parse import urlencode

from lxml import etree

from ADC_function import *

# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
    return result


def getActor(text):
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(text, etree.HTMLParser())
    result = (
        str(
            html.xpath(
                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
            )
        )
        .strip(" ['']")
        .replace("', '", ",")
    )
    return result


def getStudio(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
        )[0]
    return result


def getRuntime(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    return re.search(r"\d+", str(result)).group()


def getLabel(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
        )[0]
    return result


def getNum(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
        )[0]
    return result


def getYear(getRelease):
    try:
        result = str(re.search(r"\d{4}", getRelease).group())
        return result
    except:
        return getRelease


def getRelease(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
        )[0].lstrip("\n")
    except:
        try:
            result = html.xpath(
                "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
            )[0].lstrip("\n")
        except:
            result = "----"
    if result == "----":
        try:
            result = html.xpath(
                "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
            )[0].lstrip("\n")
        except:
            try:
                result = html.xpath(
                    "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
                )[0].lstrip("\n")
            except:
                pass
    return result.replace("/", "-")


def getTag(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
        )
    except:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
        )
    return result


def getCover(text, number):
    html = etree.fromstring(text, etree.HTMLParser())
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except:
        # sometimes fanza changes _ to \u005f in the image id
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except:
            # (TODO) handle more edge cases
            # print(html)
            # raise an exception here, same behavior as before;
            # people's major requirement is fetching the picture
            raise ValueError("can not find image")
    return result


def getDirector(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
        )[0]
    return result


def getOutline(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
            "\n", ""
        )
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
                "\n", ""
            )
    except:
        # (TODO) handle more edge cases
        # print(html)
        return ""
    return result


def getSeries(text):
    try:
        html = etree.fromstring(text, etree.HTMLParser())
        try:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
            )[0]
        except:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
            )[0]
        return result
    except:
        return ""


def main(number):
    # fanza allows letters + numbers + underscore; normalize the input here
    # @note: the only observed use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
        "https://www.dmm.co.jp/rental/-/detail/=/cid=",
    ]
    chosen_url = ""

    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(
            "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": chosen_url})
            )
        )
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": ""})
    try:
        # for some old pages, the input number does not match the page:
        # the url may be cid=test012 while the hinban on the page is test00012,
        # so get the hinban first and pass it to the following functions
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(
                getRelease(htmlcode)
            ),  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
            "series": getSeries(htmlcode),
        }
    except:
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js


def main_htmlcode(number):
    # fanza allows letters + numbers + underscore; normalize the input here
    # @note: the only observed use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": ""})
    return htmlcode


if __name__ == "__main__":
    print(main("DV-1562"))
    print(main("96fad1217"))
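The age gate is cleared by requesting the declared=yes endpoint with the real detail page URL-encoded into the rurl parameter, exactly as main() does above. Roughly, with a hypothetical cid:

from urllib.parse import urlencode
target = "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=ssis00001"  # hypothetical cid
gate = "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(urlencode({"rurl": target}))
# -> https://www.dmm.co.jp/age_check/=/declared=yes/?rurl=https%3A%2F%2Fwww.dmm.co.jp%2F...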
WebCrawler/fc2fans_club.py (new file, 165 lines)
@@ -0,0 +1,165 @@
import re
from lxml import etree  # needs install
import json
import ADC_function
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(htmlcode):  # get title
    # print(htmlcode)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    result2 = str(re.sub('\D{2}2-\d+', '', result)).replace(' ', '', 1)
    # print(result2)
    return result2

def getActor(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result

def getNum(htmlcode):  # get the number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    # print(result)
    return result

def getRelease(htmlcode2):
    # a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result

def getCover(htmlcode, number, htmlcode2):  # get cover
    # a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    if result == '':
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
        return 'https://fc2club.com' + result2
    return 'http:' + result

def getOutline(htmlcode2):  # get outline
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result

def getTag(htmlcode):  # get tags
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')

def getYear(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''


def getTitle_fc2com(htmlcode):  # get title (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
    return result

def getActor_fc2com(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
        return result
    except:
        return ''

def getStudio_fc2com(htmlcode):  # get studio (fc2.com)
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getNum_fc2com(htmlcode):  # get the number (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getRelease_fc2com(htmlcode2):
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result

def getCover_fc2com(htmlcode2):  # get cover (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
    return 'http:' + result

def getOutline_fc2com(htmlcode2):  # get outline (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result

def getTag_fc2com(number):  # get tags (fc2.com)
    htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/' + number + '/tag?'), 'utf-8').decode('unicode-escape'))
    result = re.findall('"tag":"(.*?)"', htmlcode)
    return result

def getYear_fc2com(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''

def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
        actor = getActor(htmlcode)
        if getActor(htmlcode) == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
            'outline': '',  # getOutline(htmlcode2),
            'runtime': getYear(getRelease(htmlcode)),
            'director': getStudio(htmlcode),
            'actor': actor,
            'release': getRelease(number),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'actor_photo': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'https://fc2club.com//html/FC2-' + number + '.html',
            'series': '',
        }
        if dic['title'] == '':
            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', cookies={'wei6H': '1'})
            actor = getActor(htmlcode)
            if getActor(htmlcode) == '':
                actor = 'FC2系列'
            dic = {
                'title': getTitle_fc2com(htmlcode2),
                'studio': getStudio_fc2com(htmlcode2),
                'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
                'outline': getOutline_fc2com(htmlcode2),
                'runtime': getYear_fc2com(getRelease(htmlcode2)),
                'director': getStudio_fc2com(htmlcode2),
                'actor': actor,
                'release': getRelease_fc2com(number),
                'number': 'FC2-' + number,
                'cover': getCover_fc2com(htmlcode2),
                'imagecut': 0,
                'tag': getTag_fc2com(number),
                'label': '',
                'actor_photo': '',
                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
                'series': '',
            }
    except Exception as e:
        # (TODO) handle this better
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    print(main('1252953'))
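main() only strips an 'FC2-'/'fc2-' prefix, while FC2 ids also circulate as fc2_1252953 or FC2-PPV-1252953. A more tolerant normalizer could keep just the trailing digit run; a sketch with a hypothetical helper name, not part of this commit:

import re

def normalize_fc2(number: str) -> str:
    # 'FC2-PPV-1252953' / 'fc2_1252953' / '1252953' -> '1252953'
    m = re.search(r'(\d+)\s*$', number)
    return m.group(1) if m else number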
WebCrawler/jav321.py (new file, 156 lines)
@@ -0,0 +1,156 @@
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html


def main(number: str) -> str:
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/video/" in result.url:
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            **data,
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_title(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()


def parse_info(soup: BeautifulSoup) -> dict:
    data = soup.select_one("div.row > div.col-md-9")

    if data:
        dd = str(data).split("<br/>")
        data_dic = {}
        for d in dd:
            data_dic[get_bold_text(h=d)] = d

        return {
            "actor": get_actor(data_dic),
            "label": get_label(data_dic),
            "studio": get_studio(data_dic),
            "tag": get_tag(data_dic),
            "number": get_number(data_dic),
            "release": get_release(data_dic),
            "runtime": get_runtime(data_dic),
            "series": get_series(data_dic),
        }
    else:
        return {}


def get_bold_text(h: str) -> str:
    soup = BeautifulSoup(h, "html.parser")
    if soup.b:
        return soup.b.text
    else:
        return "UNKNOWN_TAG"


def get_anchor_info(h: str) -> str:
    result = []

    data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
    for d in data:
        result.append(d.text)

    return ",".join(result)


def get_text_info(h: str) -> str:
    return h.split(": ")[1]


def get_cover(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]


def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]


def get_series2(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]


def get_actor(data: dict) -> str:
    if "女优" in data:
        return get_anchor_info(data["女优"])
    else:
        return ""


def get_label(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_tag(data: dict) -> str:
    if "标签" in data:
        return get_anchor_info(data["标签"])
    else:
        return ""


def get_studio(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_number(data: dict) -> str:
    if "番号" in data:
        return get_text_info(data["番号"])
    else:
        return ""


def get_release(data: dict) -> str:
    if "发行日期" in data:
        return get_text_info(data["发行日期"])
    else:
        return ""


def get_runtime(data: dict) -> str:
    if "播放时长" in data:
        return get_text_info(data["播放时长"])
    else:
        return ""


def get_year(data: dict) -> str:
    if "release" in data:
        return data["release"][:4]
    else:
        return ""


def get_series(data: dict) -> str:
    if "系列" in data:
        return get_anchor_info(data["系列"])
    else:
        return ""


if __name__ == "__main__":
    print(main("soe-259"))
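parse_info() works by splitting the detail block on '<br/>' and keying each fragment by its bold label, so every field lookup becomes a dictionary access. A toy fragment (hypothetical markup) shows the mechanics of get_bold_text() and get_text_info():

from bs4 import BeautifulSoup
fragment = '<b>番号</b>: soe-259'                      # one <br/>-separated chunk
key = BeautifulSoup(fragment, 'html.parser').b.text   # '番号'   (get_bold_text)
value = fragment.split(': ')[1]                       # 'soe-259' (get_text_info)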
WebCrawler/javbus.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import re
from pyquery import PyQuery as pq  # needs install
from lxml import etree  # needs install
from bs4 import BeautifulSoup  # needs install
import json
from ADC_function import *
import fanza


def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    d = {}
    for i in a:
        l = i.a['href']
        t = i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
        p = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        p2 = {t: p}
        d.update(p2)
    return d

def getTitle(htmlcode):  # get title
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    try:
        title2 = re.sub('n\d+-', '', title)
        return title2
    except:
        return title

def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result

def getYear(htmlcode):  # get year
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result

def getCover(htmlcode):  # get cover link
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    return image.attr('href')

def getRelease(htmlcode):  # get release date
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result

def getRuntime(htmlcode):  # get runtime in minutes
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a

def getActor(htmlcode):  # get actresses
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.get_text())
    return b

def getNum(htmlcode):  # get the number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getDirector(htmlcode):  # get director
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result

def getCID(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    # print(htmlcode)
    string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/', '')
    result = re.sub('/.*?.jpg', '', string)
    return result

def getOutline(htmlcode):  # get outline (the original comment said "actors")
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n', '')
        return result
    except:
        return ''

def getSerise(htmlcode):  # get series
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getTag(htmlcode):  # get tags
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    for i in a:
        if 'onmouseout' in str(i):
            continue
        tag.append(i.get_text())
    return tag


def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/' + number)
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except:
        dww_htmlcode = ''
    dic = {
        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


def main(number):
    try:
        try:
            htmlcode = get_html('https://www.javbus.com/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except:
                dww_htmlcode = ''
            dic = {
                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
                'outline': getOutline(dww_htmlcode),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
                'release': getRelease(htmlcode),
                'number': getNum(htmlcode),
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                            separators=(',', ':'), )  # .encode('UTF-8')
            return js
        except:
            return main_uncensored(number)
    except:
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js


if __name__ == "__main__":
    print(main('ipx-292'))
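getCID() recovers the DMM content id that links a javbus page to fanza: it takes the first sample-image href, drops the pics.dmm.co.jp prefix, and cuts everything from the first '/'-to-'.jpg' run. Walked through on a hypothetical href:

import re
href = 'https://pics.dmm.co.jp/digital/video/ipx00292/ipx00292-1.jpg'  # hypothetical sample link
string = href.replace('https://pics.dmm.co.jp/digital/video/', '')    # 'ipx00292/ipx00292-1.jpg'
cid = re.sub('/.*?.jpg', '', string)                                   # 'ipx00292'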
WebCrawler/javdb.py (new file, 154 lines)
@@ -0,0 +1,154 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("/html/body/section/div/h2/strong/text()")[0]
    return result

def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')

def getActorPhoto(actor):
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')

def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+')

def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results;
    # DO NOT just take the first one, take the one with the correct index
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
    return result

def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result

def getSeries(a):
    # /html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def main(number):
    try:
        number = number.upper()
        try:
            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        except:
            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser())
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
        # iterate all candidates and find the matching one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
        ids = html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)

        # if the gray placeholder image was returned, replace it with the normal cover
        cover_small = getCover_small(query_result, index=ids.index(number))
        if 'placeholder' in cover_small:
            cover_small = getCover(detail_page)

        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': cover_small,
            'imagecut': 3,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.\n")
if __name__ == "__main__":
    print(main('snyz-007'))
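Because the search page can list several videos, main() pairs the href column with the uid column and selects by exact number match; list.index raises ValueError when the number is absent, which the outer except turns into an empty result. In miniature, with made-up values:

urls = ['/v/abc123', '/v/def456']      # hypothetical candidate hrefs
ids = ['SNYZ-006', 'SNYZ-007']         # uid text shown beside each candidate
number = 'SNYZ-007'
correct_url = urls[ids.index(number)]  # '/v/def456'; ValueError if no match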
WebCrawler/javlib.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import json
import bs4
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie

from ADC_function import get_javlib_cookie, get_html


def main(number: str):
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie dictionary
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {}
    for key, morsel in s_cookie.items():
        cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": "",
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")

    if tag is not None:
        return tag.string.strip()
    else:
        return ""


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("a")

    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("td", class_="text")

    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    values = []
    for tag in tags:
        value = tag.string
        if value is not None and value != "----":
            values.append(value)

    return ",".join(x for x in values if x)


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")

    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))


if __name__ == "__main__":
    lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    # lists = ["DVMC-003"]
    for num in lists:
        print(main(num))
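get_javlib_cookie() hands back a raw Cookie header string; SimpleCookie parses it into morsels that main() flattens into the dict get_html expects. A small sketch with a made-up cookie string:

from http.cookies import SimpleCookie
raw = 'cf_clearance=abc123; session=xyz789'   # hypothetical raw header
s = SimpleCookie()
s.load(raw)
cookies = {key: morsel.value for key, morsel in s.items()}
# {'cf_clearance': 'abc123', 'session': 'xyz789'}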
WebCrawler/mgstage.py (new file, 120 lines)
@@ -0,0 +1,120 @@
|
||||
import re
|
||||
from lxml import etree
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from ADC_function import *
|
||||
# import sys
|
||||
# import io
|
||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
||||
|
||||
def getTitle(a):
|
||||
try:
|
||||
html = etree.fromstring(a, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
|
||||
return result.replace('/', ',')
|
||||
except:
|
||||
return ''
|
||||
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
|
||||
def getStudio(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getRuntime(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1 + result2).strip('+').rstrip('mi')
|
||||
def getLabel(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getNum(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+')
|
||||
def getYear(getRelease):
|
||||
try:
|
||||
result = str(re.search('\d{4}',getRelease).group())
|
||||
return result
|
||||
except:
|
||||
return getRelease
|
||||
def getRelease(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace('/','-')
|
||||
def getTag(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
||||
def getCover(htmlcode):
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
|
||||
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
|
||||
return result
|
||||
def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # assumption: the director row is labelled "監督:" on mgstage detail pages;
    # querying "シリーズ" here would just duplicate getSeries()
    result1 = str(html.xpath('//th[contains(text(),"監督:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"監督:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
    return result
def getSeries(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def main(number2):
    number = number2.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/', cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    # print(b)
    dic = {
        'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
        'studio': getStudio(a),
        'outline': getOutline(b),
        'runtime': getRuntime(a),
        'director': getDirector(a),
        'actor': getActor(a),
        'release': getRelease(a),
        'number': getNum(a),
        'cover': getCover(htmlcode),
        'imagecut': 0,
        'tag': getTag(a),
        'label': getLabel(a),
        'year': getYear(getRelease(a)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
        'source': 'mgstage.py',
        'series': getSeries(a),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
    # print(htmlcode)

if __name__ == '__main__':
    print(main('SIRO-4149'))
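# A minimal consumption sketch (assuming get_html() from ADC_function performs the
# HTTP GET): main() returns a JSON string, so callers decode it before reading fields:
#   data = json.loads(main('SIRO-4149'))
#   print(data['number'], data['title'], data['release'])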
192
WebCrawler/xcity.py
Normal file
@@ -0,0 +1,192 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *


# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
    return result


def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
    return result1

def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d
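# e.g. getActorPhoto('A,B') -> {'A': '', 'B': ''}: placeholder entries only, since
# actor photos are not scraped from xcity.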


def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # str() of an xpath result list never raises, so fall back on an empty result
    # instead of an except branch that could never trigger
    result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    if not result:
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d+', result1)[0]
    except:
        return ''


def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
        return result
    except:
        return ''


def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="hinban"]/text()')[0]
        return result
    except:
        return ''

def getYear(release):  # parameter renamed so it no longer shadows getRelease() below
    try:
        return str(re.search(r'\d{4}', release).group())
    except:
        return release

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except:
        return ''

def getTag(a):
    result2 = []
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
    for i in result1:
        i = i.replace(u'\n', '')
        i = i.replace(u'\t', '')
        result2.append(i)
    return result2

def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results, so
    # DO NOT just take the first one; take the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    if 'https' not in result:
        result = 'https:' + result
    return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
        return 'https:' + result
    except:
        return ''
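# Assumption: xcity serves protocol-relative links (beginning with //), which is why
# getCover() and getCover_small() prepend 'https:'.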


def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n', '').replace(u'\t', '')
        return result
    except:
        return ''

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
    except:
        return ''
    try:
        return re.sub(r'\\\w*\d+', '', result)
    except:
        return result

def getSeries(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        try:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
            return result
        except:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
            return result
    except:
        return ''

def main(number):
    try:
        number = number.upper()
        query_result = get_html(
            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-', '') + '&sg=main&num=30')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
        detail_page = get_html('https://xcity.jp' + urls)
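        # The keyword search returns a 'resultList' table; the first data row's link is
        # taken as the best match, and its detail page is what the getters below parse.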
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': '',
            'imagecut': 1,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://xcity.jp' + urls,
            'source': 'xcity.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}

    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    print(main('VNDS-2624'))
5
core.py
@@ -17,6 +17,7 @@ import javdb
import mgstage
import xcity
import javlib
import dlsite


def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -56,6 +57,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
        "jav321": jav321.main,
        "xcity": xcity.main,
        "javlib": javlib.main,
        "dlsite": dlsite.main,
    }

    # default fetch order list, from the beginning to the end
@@ -74,6 +76,9 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
    elif "fc2" in file_number or "FC2" in file_number:
        sources.insert(0, sources.pop(sources.index("fc2")))

    elif any(kw in file_number for kw in ("RJ", "rj", "VJ", "vj")):
        sources.insert(0, sources.pop(sources.index("dlsite")))
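    # Design note: a bare string literal such as "rj" is always truthy in Python, so
    # chaining it with `or` would make this branch match every filename; any() keeps
    # the membership test on each RJ/VJ code prefix instead.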

    json_data = {}
    for source in sources:
        json_data = json.loads(func_mapping[source](file_number))