Update 3.7-5 DEBUG ONLY
This commit is contained in:
@@ -63,31 +63,32 @@ def CEF(path):
|
|||||||
a = ''
|
a = ''
|
||||||
|
|
||||||
|
|
||||||
def create_data_and_move(file_path: str, c: config.Config):
|
def create_data_and_move(file_path: str, c: config.Config,debug):
|
||||||
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
|
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
|
||||||
n_number = get_number(file_path)
|
n_number = get_number(file_path)
|
||||||
|
|
||||||
# print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
|
if debug == True:
|
||||||
# core_main(file_path, n_number, c)
|
|
||||||
# print("[*]======================================================")
|
|
||||||
|
|
||||||
try:
|
|
||||||
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
|
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
|
||||||
core_main(file_path, n_number, c)
|
core_main(file_path, n_number, c)
|
||||||
print("[*]======================================================")
|
print("[*]======================================================")
|
||||||
except Exception as err:
|
else:
|
||||||
print("[-] [{}] ERROR:".format(file_path))
|
try:
|
||||||
print('[-]', err)
|
print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
|
||||||
|
core_main(file_path, n_number, c)
|
||||||
|
print("[*]======================================================")
|
||||||
|
except Exception as err:
|
||||||
|
print("[-] [{}] ERROR:".format(file_path))
|
||||||
|
print('[-]', err)
|
||||||
|
|
||||||
if c.soft_link():
|
if c.soft_link():
|
||||||
print("[-]Link {} to failed folder".format(file_path))
|
print("[-]Link {} to failed folder".format(file_path))
|
||||||
os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
|
os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
print("[-]Move [{}] to failed folder".format(file_path))
|
print("[-]Move [{}] to failed folder".format(file_path))
|
||||||
shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
|
shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/")
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print('[!]', err)
|
print('[!]', err)
|
||||||
|
|
||||||
def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number=None):
|
def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number=None):
|
||||||
try:
|
try:
|
||||||
@@ -145,13 +146,15 @@ if __name__ == '__main__':
|
|||||||
count = 0
|
count = 0
|
||||||
count_all = str(len(movie_list))
|
count_all = str(len(movie_list))
|
||||||
print('[+]Find', count_all, 'movies')
|
print('[+]Find', count_all, 'movies')
|
||||||
|
if conf.debug() == True:
|
||||||
|
print('[+]'+' DEBUG MODE ON '.center(54, '-'))
|
||||||
if conf.soft_link():
|
if conf.soft_link():
|
||||||
print('[!] --- Soft link mode is ENABLE! ----')
|
print('[!] --- Soft link mode is ENABLE! ----')
|
||||||
for movie_path in movie_list: # 遍历电影列表 交给core处理
|
for movie_path in movie_list: # 遍历电影列表 交给core处理
|
||||||
count = count + 1
|
count = count + 1
|
||||||
percentage = str(count / int(count_all) * 100)[:4] + '%'
|
percentage = str(count / int(count_all) * 100)[:4] + '%'
|
||||||
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
|
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
|
||||||
create_data_and_move(movie_path, conf)
|
create_data_and_move(movie_path, conf, conf.debug())
|
||||||
|
|
||||||
CEF(conf.success_folder())
|
CEF(conf.success_folder())
|
||||||
CEF(conf.failed_folder())
|
CEF(conf.failed_folder())
|
||||||
|
|||||||
0
WebCrawler/__init__.py
Normal file
0
WebCrawler/__init__.py
Normal file
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import re
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
# import sys
|
# import sys
|
||||||
# import io
|
# import io
|
||||||
@@ -24,7 +26,10 @@ def getTitle(a):
|
|||||||
return result
|
return result
|
||||||
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
|
try:
|
||||||
|
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
|
||||||
|
except:
|
||||||
|
result1 = ''
|
||||||
return result1
|
return result1
|
||||||
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
|
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
|
||||||
a = actor.split(',')
|
a = actor.split(',')
|
||||||
@@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
|
|||||||
return d
|
return d
|
||||||
def getStudio(a):
|
def getStudio(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
|
try:
|
||||||
|
try:
|
||||||
|
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = ''
|
||||||
return result
|
return result
|
||||||
def getRuntime(a):
|
def getRuntime(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
@@ -44,7 +55,13 @@ def getRuntime(a):
|
|||||||
return str(result1 + result2).strip('+').rstrip('mi')
|
return str(result1 + result2).strip('+').rstrip('mi')
|
||||||
def getLabel(a):
|
def getLabel(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
|
try:
|
||||||
|
try:
|
||||||
|
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = ''
|
||||||
return result
|
return result
|
||||||
def getYear(getRelease):
|
def getYear(getRelease):
|
||||||
try:
|
try:
|
||||||
@@ -54,12 +71,12 @@ def getYear(getRelease):
|
|||||||
return getRelease
|
return getRelease
|
||||||
def getRelease(a):
|
def getRelease(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0]
|
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
|
||||||
return result1.replace('年','-').replace('月','-').replace('日','')
|
return result1.replace('年','-').replace('月','-').replace('日','')
|
||||||
def getTag(a):
|
def getTag(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
try:
|
try:
|
||||||
result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()')
|
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
|
||||||
return result
|
return result
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
@@ -85,7 +102,10 @@ def getCover(htmlcode):
|
|||||||
return result
|
return result
|
||||||
def getDirector(a):
|
def getDirector(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0]
|
try:
|
||||||
|
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = ''
|
||||||
return result
|
return result
|
||||||
def getOutline(htmlcode):
|
def getOutline(htmlcode):
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||||
@@ -96,36 +116,52 @@ def getOutline(htmlcode):
|
|||||||
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
|
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
|
||||||
def getSeries(a):
|
def getSeries(a):
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||||
result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
|
try:
|
||||||
return result1
|
try:
|
||||||
|
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
|
||||||
|
except:
|
||||||
|
result = ''
|
||||||
|
return result
|
||||||
def main(number):
|
def main(number):
|
||||||
number = number.upper()
|
try:
|
||||||
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')
|
number = number.upper()
|
||||||
|
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
|
||||||
|
cookies={'locale': 'zh-cn'})
|
||||||
|
|
||||||
dic = {
|
dic = {
|
||||||
'actor': getActor(htmlcode),
|
'actor': getActor(htmlcode),
|
||||||
'title': getTitle(htmlcode),
|
'title': getTitle(htmlcode),
|
||||||
'studio': getStudio(htmlcode),
|
'studio': getStudio(htmlcode),
|
||||||
'outline': getOutline(htmlcode),
|
'outline': getOutline(htmlcode),
|
||||||
'runtime': getRuntime(htmlcode),
|
'runtime': '',
|
||||||
'director': getDirector(htmlcode),
|
'director': getDirector(htmlcode),
|
||||||
'release': getRelease(htmlcode),
|
'release': getRelease(htmlcode),
|
||||||
'number': number,
|
'number': number,
|
||||||
'cover': 'https:' + getCover(htmlcode),
|
'cover': 'https:' + getCover(htmlcode),
|
||||||
'cover_small': '',
|
'cover_small': '',
|
||||||
'imagecut': 0,
|
'imagecut': 0,
|
||||||
'tag': getTag(htmlcode),
|
'tag': getTag(htmlcode),
|
||||||
'label': getLabel(htmlcode),
|
'label': getLabel(htmlcode),
|
||||||
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
|
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
|
||||||
'actor_photo': '',
|
'actor_photo': '',
|
||||||
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
|
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
|
||||||
'source': 'dlsite.py',
|
'source': 'dlsite.py',
|
||||||
'series': getSeries(htmlcode),
|
'series': getSeries(htmlcode),
|
||||||
}
|
}
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
||||||
return js
|
return js
|
||||||
|
except:
|
||||||
|
data = {
|
||||||
|
"title": "",
|
||||||
|
}
|
||||||
|
js = json.dumps(
|
||||||
|
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
|
||||||
|
)
|
||||||
|
return js
|
||||||
|
|
||||||
# main('DV-1562')
|
# main('DV-1562')
|
||||||
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
|
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(main('VJ013479'))
|
print(main('VJ013178'))
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree#need install
|
from lxml import etree#need install
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from pyquery import PyQuery as pq#need install
|
from pyquery import PyQuery as pq#need install
|
||||||
from lxml import etree#need install
|
from lxml import etree#need install
|
||||||
from bs4 import BeautifulSoup#need install
|
from bs4 import BeautifulSoup#need install
|
||||||
import json
|
import json
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
import fanza
|
from WebCrawler import fanza
|
||||||
|
|
||||||
|
|
||||||
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
||||||
soup = BeautifulSoup(htmlcode, 'lxml')
|
soup = BeautifulSoup(htmlcode, 'lxml')
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import json
|
import json
|
||||||
import bs4
|
import bs4
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -26,4 +26,4 @@ literals=\()/
|
|||||||
folders=failed,JAV_output
|
folders=failed,JAV_output
|
||||||
|
|
||||||
[debug_mode]
|
[debug_mode]
|
||||||
switch=0
|
switch=1
|
||||||
20
core.py
20
core.py
@@ -8,16 +8,16 @@ from PIL import Image
|
|||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
|
|
||||||
# =========website========
|
# =========website========
|
||||||
import avsox
|
from WebCrawler import avsox
|
||||||
import fanza
|
from WebCrawler import fanza
|
||||||
import fc2fans_club
|
from WebCrawler import fc2fans_club
|
||||||
import jav321
|
from WebCrawler import jav321
|
||||||
import javbus
|
from WebCrawler import javbus
|
||||||
import javdb
|
from WebCrawler import javdb
|
||||||
import mgstage
|
from WebCrawler import mgstage
|
||||||
import xcity
|
from WebCrawler import xcity
|
||||||
import javlib
|
from WebCrawler import javlib
|
||||||
import dlsite
|
from WebCrawler import dlsite
|
||||||
|
|
||||||
|
|
||||||
def escape_path(path, escape_literals: str): # Remove escape literals
|
def escape_path(path, escape_literals: str): # Remove escape literals
|
||||||
|
|||||||
Reference in New Issue
Block a user