Update 3.7-5 DEBUG ONLY

This commit is contained in:
root
2020-08-14 17:00:31 +08:00
parent c5a68715ea
commit e687035722
14 changed files with 122 additions and 64 deletions

View File

@@ -63,14 +63,15 @@ def CEF(path):
a = ''
def create_data_and_move(file_path: str, c: config.Config):
def create_data_and_move(file_path: str, c: config.Config,debug):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
n_number = get_number(file_path)
# print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
# core_main(file_path, n_number, c)
# print("[*]======================================================")
if debug == True:
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
core_main(file_path, n_number, c)
print("[*]======================================================")
else:
try:
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
core_main(file_path, n_number, c)
@@ -145,13 +146,15 @@ if __name__ == '__main__':
count = 0
count_all = str(len(movie_list))
print('[+]Find', count_all, 'movies')
if conf.debug() == True:
print('[+]'+' DEBUG MODE ON '.center(54, '-'))
if conf.soft_link():
print('[!] --- Soft link mode is ENABLE! ----')
for movie_path in movie_list: # 遍历电影列表 交给core处理
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
create_data_and_move(movie_path, conf)
create_data_and_move(movie_path, conf, conf.debug())
CEF(conf.success_folder())
CEF(conf.failed_folder())

0
WebCrawler/__init__.py Normal file
View File

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -2,6 +2,8 @@ import re
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
@@ -24,7 +26,10 @@ def getTitle(a):
return result
def getActor(a):
    """Return the list of voice-actor names from a dlsite work page.

    `a` is the raw page HTML (fetched with the zh-cn locale cookie, so the
    header cell is labelled "声优"). Returns [] when the row is absent and
    '' only if the document itself fails to parse/evaluate.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        # xpath() yields [] for a missing row, so this guard is only for a
        # malformed document, not for absent data.
        result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
    except Exception:
        result1 = ''
    return result1
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
@@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
return d
def getStudio(a):
    """Return the studio for a dlsite work.

    Tries the "系列名" (series name) row first, then falls back to the
    "社团名" (circle name) row; returns '' when neither is present.
    `a` is the raw page HTML (zh-cn locale).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # An empty xpath() result means the row is absent -- no exception
    # handling needed, unlike the original nested bare excepts.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -44,7 +55,13 @@ def getRuntime(a):
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
    """Return the label for a dlsite work.

    NOTE(review): intentionally mirrors getStudio -- the site exposes no
    separate label field, so "系列名" (series) with a "社团名" (circle)
    fallback is used here too; returns '' when neither row exists.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Empty xpath() result == missing row; avoids the original's nested
    # bare excepts around the [0] index.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def getYear(getRelease):
try:
@@ -54,12 +71,12 @@ def getYear(getRelease):
return getRelease
def getRelease(a):
    """Return the release date ("贩卖" row) normalised to YYYY-MM-DD.

    NOTE(review): the original chained ``.replace('', '-')`` calls had
    their CJK date characters lost to an encoding mangle -- with an empty
    search string they would insert '-' between every character of the
    result. Reconstructed here as the usual 年/月/日 -> '-' normalisation;
    confirm against a live zh-cn page.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"贩卖")]/../td/a/text()')[0]
    return result1.replace('年', '-').replace('月', '-').replace('日', '')
def getTag(a):
    """Return the genre tag list ("分类" row) of a dlsite work.

    Returns [] when the row is absent; '' only when the document cannot
    be parsed/evaluated at all.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        # xpath() returns [] for a missing row, so the guard is only for
        # malformed input.
        return html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
    except Exception:
        return ''
@@ -85,7 +102,10 @@ def getCover(htmlcode):
return result
def getDirector(a):
    """Return the scenario writer ("剧情" row), or '' when absent.

    `a` is the raw page HTML (zh-cn locale).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Check for an empty match instead of the original try/[0]/bare-except.
    result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')
    return result[0] if result else ''
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -96,18 +116,26 @@ def getOutline(htmlcode):
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
    """Return the series name for a dlsite work.

    Tries "系列名" (series name) first, falls back to "社团名" (circle
    name), and returns '' when neither row exists. (The pre-patch version
    wrongly returned the voice-actor list.)
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Empty xpath() result == missing row; replaces the nested bare excepts.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def main(number):
try:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
cookies={'locale': 'zh-cn'})
dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'runtime': getRuntime(htmlcode),
'runtime': '',
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'number': number,
@@ -124,8 +152,16 @@ def main(number):
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
    # Manual smoke test: fetch and print the metadata JSON for one work id.
    print(main('VJ013178'))

View File

@@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
import json
import re
from urllib.parse import urlencode

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree#need install
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html

View File

@@ -1,10 +1,13 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
import fanza
from WebCrawler import fanza
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
import bs4
from bs4 import BeautifulSoup

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -26,4 +26,4 @@ literals=\()/
folders=failed,JAV_output
[debug_mode]
switch=0
switch=1

20
core.py
View File

@@ -8,16 +8,16 @@ from PIL import Image
from ADC_function import *
# =========website========
import avsox
import fanza
import fc2fans_club
import jav321
import javbus
import javdb
import mgstage
import xcity
import javlib
import dlsite
from WebCrawler import avsox
from WebCrawler import fanza
from WebCrawler import fc2fans_club
from WebCrawler import jav321
from WebCrawler import javbus
from WebCrawler import javdb
from WebCrawler import mgstage
from WebCrawler import xcity
from WebCrawler import javlib
from WebCrawler import dlsite
def escape_path(path, escape_literals: str): # Remove escape literals