Update 3.7-5 DEBUG ONLY

root committed on 2020-08-14 17:00:31 +08:00
commit e687035722 (parent c5a68715ea)
14 changed files with 122 additions and 64 deletions


@@ -63,31 +63,32 @@ def CEF(path):
     a = ''
 
-def create_data_and_move(file_path: str, c: config.Config):
+def create_data_and_move(file_path: str, c: config.Config,debug):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
     n_number = get_number(file_path)
-    # print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
-    # core_main(file_path, n_number, c)
-    # print("[*]======================================================")
-    try:
+    if debug == True:
         print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
         core_main(file_path, n_number, c)
         print("[*]======================================================")
-    except Exception as err:
-        print("[-] [{}] ERROR:".format(file_path))
-        print('[-]', err)
+    else:
+        try:
+            print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
+            core_main(file_path, n_number, c)
+            print("[*]======================================================")
+        except Exception as err:
+            print("[-] [{}] ERROR:".format(file_path))
+            print('[-]', err)
 
-        if c.soft_link():
-            print("[-]Link {} to failed folder".format(file_path))
-            os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
-        else:
-            try:
-                print("[-]Move [{}] to failed folder".format(file_path))
-                shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
-            except Exception as err:
-                print('[!]', err)
+            if c.soft_link():
+                print("[-]Link {} to failed folder".format(file_path))
+                os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
+            else:
+                try:
+                    print("[-]Move [{}] to failed folder".format(file_path))
+                    shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
+                except Exception as err:
+                    print('[!]', err)
 
 def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number=None):
     try:
@@ -145,13 +146,15 @@ if __name__ == '__main__':
     count = 0
     count_all = str(len(movie_list))
     print('[+]Find', count_all, 'movies')
+    if conf.debug() == True:
+        print('[+]'+' DEBUG MODE ON '.center(54, '-'))
     if conf.soft_link():
         print('[!] --- Soft link mode is ENABLE! ----')
     for movie_path in movie_list:  # iterate over the movie list and hand each file to core for processing
         count = count + 1
         percentage = str(count / int(count_all) * 100)[:4] + '%'
         print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
-        create_data_and_move(movie_path, conf)
+        create_data_and_move(movie_path, conf, conf.debug())
     CEF(conf.success_folder())
     CEF(conf.failed_folder())
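Note: with the debug flag set, core_main() runs outside the try/except above, so any exception propagates with a full traceback instead of being caught and the file being shuffled into the failed folder. The conf.debug() accessor itself is not part of this diff; assuming it simply reads the [debug_mode] switch that this commit flips to 1 further down, a minimal sketch of such an accessor could look like the following (names and behaviour assumed, not taken from the repository):

    # Sketch only: an assumed debug() accessor backed by the [debug_mode]
    # section changed later in this commit.
    from configparser import ConfigParser

    class Config:
        def __init__(self, path="config.ini"):
            self._conf = ConfigParser()
            self._conf.read(path, encoding="utf-8")

        def debug(self) -> bool:
            # switch=1 turns debug mode on; anything else (or a missing key) leaves it off
            return self._conf.get("debug_mode", "switch", fallback="0") == "1"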

WebCrawler/__init__.py (new empty file)


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import re
 from lxml import etree
 import json


@@ -2,6 +2,8 @@ import re
 from lxml import etree
 import json
 from bs4 import BeautifulSoup
+import sys
+sys.path.append('../')
 from ADC_function import *
 # import sys
 # import io
@@ -24,7 +26,10 @@ def getTitle(a):
     return result
 def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
+    try:
+        result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
+    except:
+        result1 = ''
     return result1
 def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
     a = actor.split(',')
@@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
     return d
 def getStudio(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
+    try:
+        try:
+            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
+        except:
+            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
+    except:
+        result = ''
     return result
 def getRuntime(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -44,7 +55,13 @@ def getRuntime(a):
     return str(result1 + result2).strip('+').rstrip('mi')
 def getLabel(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
+    try:
+        try:
+            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
+        except:
+            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
+    except:
+        result = ''
     return result
 def getYear(getRelease):
     try:
@@ -54,12 +71,12 @@ def getYear(getRelease):
     return getRelease
 def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = html.xpath('//th[contains(text(),"販売")]/../td/a/text()')[0]
+    result1 = html.xpath('//th[contains(text(),"贩卖")]/../td/a/text()')[0]
     return result1.replace('年','-').replace('月','-').replace('日','')
 def getTag(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     try:
-        result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()')
+        result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
         return result
     except:
         return ''
@@ -85,7 +102,10 @@ def getCover(htmlcode):
     return result
 def getDirector(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0]
+    try:
+        result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
+    except:
+        result = ''
     return result
 def getOutline(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -96,36 +116,52 @@ def getOutline(htmlcode):
     return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
 def getSeries(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
-    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
-    return result1
+    try:
+        try:
+            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
+        except:
+            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
+    except:
+        result = ''
+    return result
 def main(number):
-    number = number.upper()
-    htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')
-    dic = {
-        'actor': getActor(htmlcode),
-        'title': getTitle(htmlcode),
-        'studio': getStudio(htmlcode),
-        'outline': getOutline(htmlcode),
-        'runtime': getRuntime(htmlcode),
-        'director': getDirector(htmlcode),
-        'release': getRelease(htmlcode),
-        'number': number,
-        'cover': 'https:' + getCover(htmlcode),
-        'cover_small': '',
-        'imagecut': 0,
-        'tag': getTag(htmlcode),
-        'label': getLabel(htmlcode),
-        'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
-        'actor_photo': '',
-        'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
-        'source': 'dlsite.py',
-        'series': getSeries(htmlcode),
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
-    return js
+    try:
+        number = number.upper()
+        htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
+                            cookies={'locale': 'zh-cn'})
+        dic = {
+            'actor': getActor(htmlcode),
+            'title': getTitle(htmlcode),
+            'studio': getStudio(htmlcode),
+            'outline': getOutline(htmlcode),
+            'runtime': '',
+            'director': getDirector(htmlcode),
+            'release': getRelease(htmlcode),
+            'number': number,
+            'cover': 'https:' + getCover(htmlcode),
+            'cover_small': '',
+            'imagecut': 0,
+            'tag': getTag(htmlcode),
+            'label': getLabel(htmlcode),
+            'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
+            'actor_photo': '',
+            'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
+            'source': 'dlsite.py',
+            'series': getSeries(htmlcode),
+        }
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
+        return js
+    except:
+        data = {
+            "title": "",
+        }
+        js = json.dumps(
+            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
+        )
+        return js
 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
 if __name__ == "__main__":
-    print(main('VJ013479'))
+    print(main('VJ013178'))
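The getStudio, getLabel, getDirector and getSeries changes above all repeat the same pattern: query a zh-cn localized XPath, fall back to an alternative label, and return '' when nothing matches. If that pattern were ever factored out, a small helper along these lines would cover all four cases; this is an illustrative sketch only, not part of this commit, and xpath_first is an invented name:

    def xpath_first(html, queries, default=''):
        # Try each XPath in order; return the first non-empty match,
        # or `default` when nothing matches or a query fails.
        for q in queries:
            try:
                hits = html.xpath(q)
                if hits:
                    return hits[0]
            except Exception:
                continue
        return default

    # getStudio(a) above could then reduce to:
    # html = etree.fromstring(a, etree.HTMLParser())
    # return xpath_first(html, [
    #     '//th[contains(text(),"系列名")]/../td/span[1]/a/text()',
    #     '//th[contains(text(),"社团名")]/../td/span[1]/a/text()',
    # ])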


@@ -1,5 +1,7 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+import sys
+sys.path.append('../')
 import json
 import re
 from urllib.parse import urlencode


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import re
 from lxml import etree#need install
 import json


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import json
 from bs4 import BeautifulSoup
 from lxml import html


@@ -1,10 +1,13 @@
+import sys
+sys.path.append('../')
 import re
 from pyquery import PyQuery as pq#need install
 from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
-import fanza
+from WebCrawler import fanza
 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
     soup = BeautifulSoup(htmlcode, 'lxml')


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import re
 from lxml import etree
 import json


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import json
 import bs4
 from bs4 import BeautifulSoup


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import re
 from lxml import etree
 import json


@@ -1,3 +1,5 @@
+import sys
+sys.path.append('../')
 import re
 from lxml import etree
 import json


@@ -26,4 +26,4 @@ literals=\()/
 folders=failed,JAV_output
 
 [debug_mode]
-switch=0
+switch=1

core.py (20 changed lines)

@@ -8,16 +8,16 @@ from PIL import Image
 from ADC_function import *
 
 # =========website========
-import avsox
-import fanza
-import fc2fans_club
-import jav321
-import javbus
-import javdb
-import mgstage
-import xcity
-import javlib
-import dlsite
+from WebCrawler import avsox
+from WebCrawler import fanza
+from WebCrawler import fc2fans_club
+from WebCrawler import jav321
+from WebCrawler import javbus
+from WebCrawler import javdb
+from WebCrawler import mgstage
+from WebCrawler import xcity
+from WebCrawler import javlib
+from WebCrawler import dlsite
 
 def escape_path(path, escape_literals: str): # Remove escape literals
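For reference, these imports resolve because the commit adds an empty WebCrawler/__init__.py, turning the crawler scripts into a package, while the sys.path.append('../') lines added to each crawler keep `from ADC_function import *` working when a module is run on its own from inside WebCrawler/. A minimal usage sketch of the new layout (it assumes the project root as the working directory, and calling it performs a real HTTP request):

    # Sketch: calling one crawler through the new package layout.
    from WebCrawler import dlsite

    if __name__ == "__main__":
        # VJ013178 is the sample id used in dlsite.py's own __main__ block above.
        print(dlsite.main("VJ013178"))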