Update 3.5

2020-06-21 23:53:08 +08:00
parent b016113fc1
commit 2c6169b340
10 changed files with 117 additions and 42 deletions
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -86,7 +86,7 @@ def create_data_and_move(file_path: str, c: config.Config):


 if __name__ == '__main__':
-    version = '3.4.3'
+    version = '3.5'

    # Parse command line args
    single_file_path, config_file, auto_exit = argparse_function()
--- a/avsox.py
+++ b/avsox.py
@@ -72,6 +72,13 @@ def getTag(a):  # 获取演员
    for i in a:
        d.append(i.get_text())
    return d
+def getSeries(htmlcode):
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
+        return result1
+    except:
+        return ''

 def main(number):
    a = get_html('https://avsox.host/cn/search/' + number)
@@ -108,8 +115,10 @@ def main(number):
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
+        'series': getSeries(info),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

-#print(main('012717_472'))
+if __name__ == "__main__":
+    print(main('012717_472'))
--- a/core.py
+++ b/core.py
@@ -98,6 +98,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # 从JSON
    runtime = json_data['runtime']
    outline = json_data['outline']
    label = json_data['label']
+    series = json_data['series']
    year = json_data['year']
    try:
        cover_small = json_data['cover_small']
@@ -166,7 +167,8 @@ def get_info(json_data):  # 返回json里的数据
    number = json_data['number']
    cover = json_data['cover']
    website = json_data['website']
-    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website
+    series = json_data['series']
+    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series


 def small_cover_check(path, number, cover_small, c_word, conf: config.Config, filepath, failed_folder):
@@ -263,7 +265,7 @@ def image_download(cover, number, c_word, path, conf: config.Config, filepath, f


 def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list, liuchu):
-    title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data)
+    title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series = get_info(json_data)

    try:
        if not os.path.exists(path):
@@ -300,6 +302,7 @@ def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, fa
            try:
                for i in tag:
                    print("  <tag>" + i + "</tag>", file=code)
+                print("  <tag>" + series + "</tag>", file=code)
            except:
                aaaaa = ''
            try:
--- a/fanza.py
+++ b/fanza.py
@@ -108,7 +108,7 @@ def getRelease(text):
                )[0].lstrip("\n")
            except:
                pass
-    return result
+    return result.replace('/','-')


 def getTag(text):
@@ -174,6 +174,23 @@ def getOutline(text):
    return result


+def getSeries(text):
+    try:
+        html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        try:
+            result = html.xpath(
+                "//td[contains(text(),'シリーズ：')]/following-sibling::td/a/text()"
+            )[0]
+        except:
+            result = html.xpath(
+                "//td[contains(text(),'シリーズ：')]/following-sibling::td/text()"
+            )[0]
+        return result
+    except:
+        return ''
+
+
+
 def main(number):
    # fanza allow letter + number + underscore, normalize the input here
    # @note: I only find the usage of underscore as h_test123456789
@@ -225,6 +242,7 @@ def main(number):
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
+            "series": getSeries(htmlcode),
        }
    except:
        data = {
@@ -266,7 +284,4 @@ def main_htmlcode(number):


 if __name__ == "__main__":
-    # print(main("DV-1562"))
-    # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束，你可以在结束之前查看和错误信息。")
-    # print(main("ipx292"))
-    pass
+    print(main("DV-1562"))
--- a/fc2fans_club.py
+++ b/fc2fans_club.py
@@ -159,5 +159,5 @@ def main(number):
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8')
    return js

-
-#print(main('1252953'))
+if __name__ == '__main__':
+    print(main('1252953'))
--- a/jav321.py
+++ b/jav321.py
@@ -13,7 +13,6 @@ def main(number: str) -> json:
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
-            "studio": "",
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
@@ -46,10 +45,12 @@ def parse_info(soup: BeautifulSoup) -> dict:
        return {
            "actor": get_actor(data_dic),
            "label": get_label(data_dic),
+            "studio": get_studio(data_dic),
            "tag": get_tag(data_dic),
            "number": get_number(data_dic),
            "release": get_release(data_dic),
            "runtime": get_runtime(data_dic),
+            "series": get_series(data_dic),
        }
    else:
        return {}
@@ -84,6 +85,9 @@ def get_cover(lx: html.HtmlElement) -> str:
 def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]

+def get_series2(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
+

 def get_actor(data: hash) -> str:
    if "女优" in data:
@@ -106,6 +110,13 @@ def get_tag(data: hash) -> str:
        return ""


+def get_studio(data: hash) -> str:
+    if "片商" in data:
+        return get_anchor_info(data["片商"])
+    else:
+        return ""
+
+
 def get_number(data: hash) -> str:
    if "番号" in data:
        return get_text_info(data["番号"])
@@ -134,5 +145,12 @@ def get_year(data: hash) -> str:
        return ""


+def get_series(data: hash) -> str:
+    if "系列" in data:
+        return get_anchor_info(data["系列"])
+    else:
+        return ""
+
+
 if __name__ == "__main__":
-    print(main("wmc-002"))
+    print(main("soe-259"))
--- a/javbus.py
+++ b/javbus.py
@@ -75,9 +75,12 @@ def getOutline(htmlcode):  #获取演员
    except:
        return ''
 def getSerise(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    return result
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
+        return result
+    except:
+        return ''
 def getTag(htmlcode):  # 获取演员
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
@@ -113,6 +116,7 @@ def main_uncensored(number):
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
+        'series': getSerise(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
@@ -143,6 +147,7 @@ def main(number):
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
+                'series': getSerise(htmlcode),
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                            separators=(',', ':'), )  # .encode('UTF-8')
@@ -157,3 +162,6 @@ def main(number):
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
+
+if __name__ == "__main__" :
+    print(main('ipx-292'))
--- a/javdb.py
+++ b/javdb.py
@@ -13,8 +13,8 @@ def getTitle(a):
    return result
 def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
 def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
@@ -25,23 +25,23 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
    return d
 def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')
 def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
-    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')
 def getYear(getRelease):
    try:
@@ -51,14 +51,18 @@ def getYear(getRelease):
        return getRelease
 def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+')
 def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
-    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')
+    try:
+        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
+        return result
+    except:
+        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
+        return result
+
 def getCover_small(a, index=0):
    # same issue mentioned below,
    # javdb sometime returns multiple results
@@ -74,17 +78,26 @@ def getCover(htmlcode):
    return result
 def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result
+def getSeries(a):
+    #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
+    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
+    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def main(number):
    try:
        number = number.upper()
-        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+        try:
+            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+        except:
+            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        # javdb sometime returns multiple results,
        # and the first elememt maybe not the one we are looking for
@@ -111,6 +124,7 @@ def main(number):
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
+            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
@@ -120,4 +134,5 @@ def main(number):

 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束，你可以在结束之前查看和错误信息。")
-#print(main('ipx-292'))
+if __name__ == "__main__":
+    print(main('ipx-292'))
--- a/mgstage.py
+++ b/mgstage.py
@@ -21,8 +21,8 @@ def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/t
    return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
 def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
-    result1=str(html.xpath('//th[contains(text(),"シリーズ：")]/../td/a/text()')).strip(" ['']").strip('\\n    ').strip('\\n')
-    result2=str(html.xpath('//th[contains(text(),"シリーズ：")]/../td/text()')).strip(" ['']").strip('\\n    ').strip('\\n')
+    result1=str(html.xpath('//th[contains(text(),"メーカー：")]/../td/a/text()')).strip(" ['']").strip('\\n    ').strip('\\n')
+    result2=str(html.xpath('//th[contains(text(),"メーカー：")]/../td/text()')).strip(" ['']").strip('\\n    ').strip('\\n')
    return str(result1+result2).strip('+').replace("', '",'').replace('"','')
 def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
@@ -55,14 +55,14 @@ def getRelease(a):
        '\\n')
    result2 = str(html.xpath('//th[contains(text(),"配信開始日：")]/../td/text()')).strip(" ['']").strip('\\n    ').strip(
        '\\n')
-    return str(result1 + result2).strip('+')
+    return str(result1 + result2).strip('+').replace('/','-')
 def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"ジャンル：")]/../td/a/text()')).strip(" ['']").strip('\\n    ').strip(
        '\\n')
    result2 = str(html.xpath('//th[contains(text(),"ジャンル：")]/../td/text()')).strip(" ['']").strip('\\n    ').strip(
        '\\n')
-    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
+    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
 def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
@@ -79,6 +79,13 @@ def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
    return result
+def getSeries(a):
+    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n    ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n    ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def main(number2):
    number=number2.upper()
    htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
@@ -103,6 +110,7 @@ def main(number2):
        'actor_photo': '',
        'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
        'source': 'mgstage.py',
+        'series': getSeries(a),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
--- a/xcity.py
+++ b/xcity.py
@@ -70,7 +70,7 @@ def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')).strip(" ['']")
    try:
-        return re.findall('\d{4}/\d{2}/\d{2}', result1)[0]
+        return re.findall('\d{4}/\d{2}/\d{2}', result1)[0].replace('/','-')
    except:
        return ''

@@ -122,8 +122,7 @@ def main(number):
    try:
        number = number.upper()
        query_result = get_html(
-            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-',
-                                                                                                  '') + '&sg=main&num=30')
+            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
        detail_page = get_html('https://xcity.jp' + urls)