Update 3.7-5 DEBUG ONLY

This commit is contained in:
root
2020-08-14 17:00:31 +08:00
parent c5a68715ea
commit e687035722
14 changed files with 122 additions and 64 deletions

View File

@@ -63,14 +63,15 @@ def CEF(path):
a = ''
def create_data_and_move(file_path: str, c: config.Config):
def create_data_and_move(file_path: str, c: config.Config,debug):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
n_number = get_number(file_path)
# print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
# core_main(file_path, n_number, c)
# print("[*]======================================================")
if debug == True:
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
core_main(file_path, n_number, c)
print("[*]======================================================")
else:
try:
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
core_main(file_path, n_number, c)
@@ -145,13 +146,15 @@ if __name__ == '__main__':
count = 0
count_all = str(len(movie_list))
print('[+]Find', count_all, 'movies')
if conf.debug() == True:
print('[+]'+' DEBUG MODE ON '.center(54, '-'))
if conf.soft_link():
print('[!] --- Soft link mode is ENABLE! ----')
for movie_path in movie_list: # 遍历电影列表 交给core处理
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
create_data_and_move(movie_path, conf)
create_data_and_move(movie_path, conf, conf.debug())
CEF(conf.success_folder())
CEF(conf.failed_folder())

0
WebCrawler/__init__.py Normal file
View File

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -2,6 +2,8 @@ import re
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
@@ -24,7 +26,10 @@ def getTitle(a):
return result
def getActor(a):
    """Return the list of voice-actor names from a dlsite work page.

    `a` is the raw page HTML (fetched with the zh-cn locale cookie, so the
    header cell is labelled "声优"). Returns [] when the row is absent and
    '' only if the document itself fails to parse/evaluate.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        # xpath() yields [] for a missing row, so this guard is only for a
        # malformed document, not for absent data.
        result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
    except Exception:
        result1 = ''
    return result1
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
@@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
return d
def getStudio(a):
    """Return the studio for a dlsite work.

    Tries the "系列名" (series name) row first, then falls back to the
    "社团名" (circle name) row; returns '' when neither is present.
    `a` is the raw page HTML (zh-cn locale).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # An empty xpath() result means the row is absent -- no exception
    # handling needed, unlike the original nested bare excepts.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -44,7 +55,13 @@ def getRuntime(a):
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
    """Return the label for a dlsite work.

    NOTE(review): intentionally mirrors getStudio -- the site exposes no
    separate label field, so "系列名" (series) with a "社团名" (circle)
    fallback is used here too; returns '' when neither row exists.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Empty xpath() result == missing row; avoids the original's nested
    # bare excepts around the [0] index.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def getYear(getRelease):
try:
@@ -54,12 +71,12 @@ def getYear(getRelease):
return getRelease
def getRelease(a):
    """Return the release date ("贩卖" row) normalised to YYYY-MM-DD.

    NOTE(review): the original chained ``.replace('', '-')`` calls had
    their CJK date characters lost to an encoding mangle -- with an empty
    search string they would insert '-' between every character of the
    result. Reconstructed here as the usual 年/月/日 -> '-' normalisation;
    confirm against a live zh-cn page.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"贩卖")]/../td/a/text()')[0]
    return result1.replace('年', '-').replace('月', '-').replace('日', '')
def getTag(a):
    """Return the genre tag list ("分类" row) of a dlsite work.

    Returns [] when the row is absent; '' only when the document cannot
    be parsed/evaluated at all.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        # xpath() returns [] for a missing row, so the guard is only for
        # malformed input.
        return html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
    except Exception:
        return ''
@@ -85,7 +102,10 @@ def getCover(htmlcode):
return result
def getDirector(a):
    """Return the scenario writer ("剧情" row), or '' when absent.

    `a` is the raw page HTML (zh-cn locale).
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Check for an empty match instead of the original try/[0]/bare-except.
    result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')
    return result[0] if result else ''
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -96,18 +116,26 @@ def getOutline(htmlcode):
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
    """Return the series name for a dlsite work.

    Tries "系列名" (series name) first, falls back to "社团名" (circle
    name), and returns '' when neither row exists. (The pre-patch version
    wrongly returned the voice-actor list.)
    """
    html = etree.fromstring(a, etree.HTMLParser())
    # Empty xpath() result == missing row; replaces the nested bare excepts.
    for label in ('系列名', '社团名'):
        result = html.xpath('//th[contains(text(),"' + label + '")]/../td/span[1]/a/text()')
        if result:
            return result[0]
    return ''
def main(number):
try:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
cookies={'locale': 'zh-cn'})
dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'runtime': getRuntime(htmlcode),
'runtime': '',
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'number': number,
@@ -124,8 +152,16 @@ def main(number):
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
    # Manual smoke test: fetch and print the metadata JSON for one work id.
    print(main('VJ013178'))

View File

@@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
import json
import re
from urllib.parse import urlencode

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree#need install
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html

View File

@@ -1,10 +1,13 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
import fanza
from WebCrawler import fanza
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
import bs4
from bs4 import BeautifulSoup

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json

View File

@@ -26,4 +26,4 @@ literals=\()/
folders=failed,JAV_output
[debug_mode]
switch=0
switch=1

20
core.py
View File

@@ -8,16 +8,16 @@ from PIL import Image
from ADC_function import *
# =========website========
import avsox
import fanza
import fc2fans_club
import jav321
import javbus
import javdb
import mgstage
import xcity
import javlib
import dlsite
from WebCrawler import avsox
from WebCrawler import fanza
from WebCrawler import fc2fans_club
from WebCrawler import jav321
from WebCrawler import javbus
from WebCrawler import javdb
from WebCrawler import mgstage
from WebCrawler import xcity
from WebCrawler import javlib
from WebCrawler import dlsite
def escape_path(path, escape_literals: str): # Remove escape literals