@@ -9,7 +9,8 @@ from bs4 import BeautifulSoup#need install
from PIL import Image #need install
import time
def get_html ( url ) :
#=====================爬虫核心部分==========================
def get_html ( url ) : #网页请求核心
headers = { ' User-Agent ' : ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 ' }
getweb = requests . get ( str ( url ) , proxies = { " http " : " http://127.0.0.1:2334 " , " https " : " https://127.0.0.1:2334 " } , timeout = 5 , headers = headers ) . text
try :
@@ -18,65 +19,64 @@ def get_html(url):
print ( e )
except IOError as e1 :
print ( e1 )
#================================================
def getTitle ( htmlcode ) :
def getTitle ( htmlcode ) : #获取标题
doc = pq ( htmlcode )
title = str ( doc ( ' div.container h3 ' ) . text ( ) ) . replace ( ' ' , ' - ' )
return title
def getStudio ( htmlcode ) :
def getStudio ( htmlcode ) : #获取厂商
html = etree . fromstring ( htmlcode , etree . HTMLParser ( ) )
result = str ( html . xpath ( ' /html/body/div[5]/div[1]/div[2]/p[5]/a/text() ' ) ) . strip ( " [ ' ' ] " )
return result
def getYear ( htmlcode ) :
def getYear ( htmlcode ) : #获取年份
html = etree . fromstring ( htmlcode , etree . HTMLParser ( ) )
result = str ( html . xpath ( ' /html/body/div[5]/div[1]/div[2]/p[2]/text() ' ) ) . strip ( " [ ' ' ] " )
result2 = str ( re . search ( ' \ d {4} ' , result ) . group ( 0 ) )
return result2
def getCover ( htmlcode ) :
def getCover ( htmlcode ) : #获取封面链接
doc = pq ( htmlcode )
image = doc ( ' a.bigImage ' )
return image . attr ( ' href ' )
print ( image . attr ( ' href ' ) )
def getRelease ( htmlcode ) :
def getRelease ( htmlcode ) : #获取出版日期
html = etree . fromstring ( htmlcode , etree . HTMLParser ( ) )
result = str ( html . xpath ( ' /html/body/div[5]/div[1]/div[2]/p[2]/text() ' ) ) . strip ( " [ ' ' ] " )
return result
def getRuntime ( htmlcode ) :
def getRuntime ( htmlcode ) : #获取分钟
soup = BeautifulSoup ( htmlcode , ' lxml ' )
a = soup . find ( text = re . compile ( ' 分鐘 ' ) )
return a
def getActor ( htmlcode ) :
def getActor ( htmlcode ) : #获取女优
b = [ ]
soup = BeautifulSoup ( htmlcode , ' lxml ' )
a = soup . find_all ( attrs = { ' class ' : ' star-name ' } )
for i in a :
b . append ( i . text )
return " , " . join ( b )
def getNum ( htmlcode ) :
def getNum ( htmlcode ) : #获取番号
html = etree . fromstring ( htmlcode , etree . HTMLParser ( ) )
result = str ( html . xpath ( ' /html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text() ' ) ) . strip ( " [ ' ' ] " )
return result
def getDirector ( htmlcode ) :
def getDirector ( htmlcode ) : #获取导演
html = etree . fromstring ( htmlcode , etree . HTMLParser ( ) )
result = str ( html . xpath ( ' /html/body/div[5]/div[1]/div[2]/p[4]/a/text() ' ) ) . strip ( " [ ' ' ] " )
return result
def getOutline ( htmlcode ) :
def getOutline ( htmlcode ) : #获取演员
doc = pq ( htmlcode )
result = str ( doc ( ' tr td div.mg-b20.lh4 p.mg-b20 ' ) . text ( ) )
return result
#================================================
#=====================资源下载部分 ===========================
def DownloadFileWithFilename ( url , filename , path ) : #path = examle:photo , video.in the Project Folder!
import requests
import re
try :
if not os . path . exists ( path ) :
os . makedirs ( path )
r = requests . get ( url )
with open ( str ( path ) + " / " + str ( filename ) , " wb " ) as code :
code . write ( r . content )
# print('[+]Downloaded!',str(path) + "/"+str(filename))
except IOError as e :
print ( " [-]Download Failed1! " )
print ( " [-]Error: " + str ( e ) )
print ( " [-]Movie not found in Javbus.com! " )
print ( " [*]===================================== " )
return " failed "
@@ -85,8 +85,7 @@ def DownloadFileWithFilename(url,filename,path): #path = examle:photo , video.in
print ( " [-]Download Failed2! " )
time . sleep ( 3 )
os . _exit ( 0 )
def PrintFiles ( path ) :
def PrintFiles ( html , path ) :
try :
if not os . path . exists ( path ) :
os . makedirs ( path )
@@ -97,8 +96,8 @@ def PrintFiles(path):
print ( " </set> " , file = code )
print ( " <studio> " + getStudio ( html ) + " +</studio> " , file = code )
print ( " <year> " + getYear ( html ) + " </year> " , file = code )
print ( " <outline> " + getOutline ( html_outline ) + " </outline> " , file = code )
print ( " <plot> " + getOutline ( html_outline ) + " </plot> " , file = code )
print ( " <outline> " + getOutline ( html ) + " </outline> " , file = code )
print ( " <plot> " + getOutline ( html ) + " </plot> " , file = code )
print ( " <runtime> " + str ( getRuntime ( html ) ) . replace ( " " , " " ) + " </runtime> " , file = code )
print ( " <director> " + getDirector ( html ) + " </director> " , file = code )
print ( " <poster> " + getNum ( html ) + " .png</poster> " , file = code )
@@ -122,81 +121,81 @@ def PrintFiles(path):
except Exception as e1 :
print ( e1 )
print ( " [-]Write Failed! " )
#================================================
if __name__ == ' __main__ ' :
#命令行处理
#=====================本地文件处理===========================
def argparse_get_file ( ) :
import argparse
parser = argparse . ArgumentParser ( )
parser . add_argument ( " file " , help = " Write the file path on here " )
args = parser . parse_args ( )
#===============================
#获取文件名称
filename = str ( os . path . basename ( args . file ) ) #\[\d{4}(\-|\/|.)\d{1,2}\1\d{1,2}\]
#去除文件名中日期
deldate = str ( re . sub ( " \ [ \ d {4} - \ d { 1,2}- \ d { 1,2} \ ] - " , " " , filename ) )
return args . file
def getNumberFromFilename ( filepath ) :
filename = str ( os . path . basename ( filepath ) ) #电影文件名
str ( re . sub ( " \ [ \ d {4} - \ d { 1,2}- \ d { 1,2} \ ] - " , " " , filename ) )
#检测是否可以获取番号,如果不行终止本程序
def number_getter ( ) :
print ( " [!]Making Data for [ " + filename + " ] " )
try :
a = str ( re . search ( ' \ w+- \ w+ ' , deldat e) . group ( ) )
a = str ( re . search ( ' \ w+- \ w+ ' , filenam e) . group ( ) )
return a
except :
print ( ' [-]File ' + filename + ' `s number can not be caught ' )
print ( ' [-]Move ' + filename + ' to failed folder ' )
filepath = str ( args ) . replace ( " Namespace(file= ' " , ' ' ) . replace ( " ' ) " , ' ' ) . replace ( ' \\ \\ ' , ' \\ ' )
if not os . path . exists ( ' failed/ ' ) : # 新建failed文件夹
os . makedirs ( ' failed/ ' )
if not os . path . exists ( ' failed/ ' ) :
print ( " [-]failed!Dirs can not be make (Please run as Administrator) " )
time . sleep ( 3 )
os . _exit ( 0 )
shutil . move ( filepath , str ( os . getcwd ( ) ) + ' /failed/ ' )
shutil . move ( filepath , str ( os . getcwd ( ) ) + ' /' + ' failed/ ' )
os . _exit ( 0 )
number = number_getter ( )
#print( number)
#获取网页HTML
html = get_html ( " https://www.javbus.com/ " + str ( number ) )
html_outline = get_html ( " https://www.dmm.co.jp/mono/dvd/-/detail/=/cid= " + number . replace ( " - " , ' ' ) )
#处理超长文件夹名称
if len ( getActor ( html ) ) > 240 :
path = ' JAV_output ' + ' / ' + ' 超多人 ' + ' / ' + getNum ( html ) #path为影片+元数据所在目录
def get_html_javbus ( number ) :
return get_html ( " https://www.javbus.com/ " + str ( number) )
def get_html_dww ( number ) :
return get_html ( " https://www.dmm.co.jp/mono/dvd/-/detail/=/cid= " + number . replace ( " - " , ' ' ) )
path = ' ' #设置path为全局变量, 后面移动文件要用
def creatFolder ( html , number ) :
global path
if not os . path . exists ( ' failed/ ' ) : #新建failed文件夹
os . makedirs ( ' failed/ ' )
if not os . path . exists ( ' failed/ ' ) :
print ( " [-]failed!Dirs can not be make (Please run as Administrator) " )
os . _exit ( 0 )
if len ( getActor ( html ) ) > 240 : #新建成功输出文件夹
path = ' JAV_output ' + ' / ' + ' 超多人 ' + ' / ' + number #path为影片+元数据所在目录
else :
path = ' JAV_output ' + ' / ' + getActor ( html ) + ' / ' + getNum ( html )
path = ' JAV_output ' + ' / ' + getActor ( html ) + ' / ' + number
if not os . path . exists ( path ) :
os . makedirs ( path )
#文件路径处理
filepath = str ( args ) . replace ( " Namespace(file= ' " , ' ' ) . replace ( " ' ) " , ' ' ) . replace ( ' \\ \\ ' , ' \\ ' )
houzhui = str ( re . search ( ' [.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|avi|rmvb|wmv|mov|mp4|mkv|flv)$ ' , file path) . group ( ) )
#下载元数据
#如果DownloadFileWithFilename返回为failed, 就退出本程序
if not os . path . exists ( ' failed/ ' ) : #新建failed文件夹
os . makedirs ( ' failed/ ' )
if not os . path . exists ( ' failed/ ' ) :
print ( " [-]failed!Dirs can not be make (Please run as Administrator) " )
os . _exit ( 0 )
if DownloadFileWithFilename ( getCover ( html ) , getNum ( html ) + ' .jpg ' , path ) == ' failed ' :
shutil . move ( filepath , ' failed/ ' )
time . sleep ( 3 )
os . _exit ( 0 )
else :
if DownloadFileWithFilename ( getCover ( html ) , getNum ( html ) + ' .jpg ' , path ) == ' failed ' :
path = str ( os . getcwd ( ) ) + ' / ' + path
def imageDownload ( htmlcode , filepath , number ) : #封面是否下载成功, 否则移动到failed
if DownloadFileWithFilename ( getCover ( htmlcode ) , number + ' .jpg ' , path ) == ' failed ' :
shutil . move ( filepath , ' failed/ ' )
os . _exit ( 0 )
DownloadFileWithFilename ( getCover ( html ) , getNum ( html ) + ' .jpg ' , path )
print ( ' [+]Downloaded! ' , path + ' / ' + getNum ( html ) + ' .jpg ' )
#切割图片做封面
DownloadFileWithFilename ( getCover ( htmlcode ) , number + ' .jpg ' , path )
print ( ' [+]Image Downloaded! ' , path + ' / ' + number + ' .jpg ' )
def cutImage ( number ) :
try :
img = Image . open ( path + ' / ' + getNum ( html ) + ' .jpg ' )
img2 = img . crop ( ( 421 , 0 , 800 , 538 ) )
img2 . save ( path + ' / ' + getNum ( html ) + ' .png ' )
img = Image . open ( path + ' / ' + number + ' .jpg ' )
imgSize = img . size
w = img. width
h = img . height
img2 = img . crop ( ( w / 1.9 , 0 , w , h ) )
img2 . save ( path + ' / ' + number + ' .png ' )
except :
print ( ' [-]Cover cut failed! ' )
# 电源文件位置处理
def pasteFileToFolder ( filepath , number , path ) : #文件路径,番号,后缀,要移动至的位置
houzhui = str ( re . search ( ' [.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|avi|rmvb|wmv|mov|mp4|mkv|flv)$ ' , filepath ) . group ( ) )
os . rename ( filepath , number + houzhui )
shutil . move ( number + houzhui , path )
#下载元数据
PrintFiles ( path )
print ( ' [!]Finished! ' )
time . sleep ( 3 )
if __name__ == ' __main__ ' :
filepath = argparse_get_file ( ) #影片的路径
number = getNumberFromFilename ( filepath ) #定义番号
htmlcode = get_html_javbus ( number ) #获取的HTML代码
creatFolder ( htmlcode , number ) #创建文件夹
imageDownload ( htmlcode , filepath , number ) #creatFoder会返回番号路径
PrintFiles ( htmlcode , path ) #打印文件
cutImage ( number ) #裁剪图
pasteFileToFolder ( filepath , number , path ) #移动文件