From 764fba74ec9c40e8d24b91c9bfc5908246f0e346 Mon Sep 17 00:00:00 2001 From: wenead99 <42309414+wenead99@users.noreply.github.com> Date: Wed, 19 Jun 2019 18:19:34 +0800 Subject: [PATCH] Beta 10.2 Update --- ADC_function.py | 17 +++++++---- AV_Data_Capture.py | 6 ++-- core.py | 2 +- fc2fans_club.py | 4 +-- javbus.py | 35 +++++++++++------------ siro.py | 71 +++++++++++++++++++++++++--------------------- 6 files changed, 75 insertions(+), 60 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index e356b5b..0a3b4ed 100644 --- a/ADC_function.py +++ b/ADC_function.py @@ -1,8 +1,15 @@ import requests -from configparser import ConfigParser +from configparser import RawConfigParser import os +import re -config = ConfigParser() +# content = open('proxy.ini').read() +# content = re.sub(r"\xfe\xff","", content) +# content = re.sub(r"\xff\xfe","", content) +# content = re.sub(r"\xef\xbb\xbf","", content) +# open('BaseConfig.cfg', 'w').write(content) + +config = RawConfigParser() if os.path.exists('proxy.ini'): config.read('proxy.ini', encoding='UTF-8') else: @@ -10,14 +17,14 @@ else: print("[proxy]",file=code) print("proxy=127.0.0.1:1080",file=code) -def get_html(url):#网页请求核心 +def get_html(url,cookies = None):#网页请求核心 if not str(config['proxy']['proxy']) == '': proxies = { "http" : "http://" + str(config['proxy']['proxy']), "https": "https://" + str(config['proxy']['proxy']) } headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'} - getweb = requests.get(str(url), headers=headers, proxies=proxies) + getweb = requests.get(str(url), headers=headers, proxies=proxies,cookies=cookies) getweb.encoding = 'utf-8' # print(getweb.text) try: @@ -27,7 +34,7 @@ def get_html(url):#网页请求核心 else: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} - getweb = requests.get(str(url), headers=headers) + getweb = requests.get(str(url), headers=headers,cookies=cookies) getweb.encoding = 'utf-8' try: return getweb.text diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 9782d27..3e4d7b3 100644 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -19,8 +19,10 @@ def movie_lists(): f2 = glob.glob(os.getcwd() + r"\*.mkv") # FLV g2 = glob.glob(os.getcwd() + r"\*.flv") + # TS + h2 = glob.glob(os.getcwd() + r"\*.ts") - total = a2+b2+c2+d2+e2+f2+g2 + total = a2+b2+c2+d2+e2+f2+g2+h2 return total def lists_from_test(custom_nuber): #电影列表 @@ -58,4 +60,4 @@ if __name__ =='__main__': print("[!]Cleaning empty folders") CEF('JAV_output') print("[+]All finished!!!") - input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看错误信息。") + input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看错误信息。") \ No newline at end of file diff --git a/core.py b/core.py index 0f20df3..ed5e9a2 100644 --- a/core.py +++ b/core.py @@ -299,7 +299,7 @@ def cutImage(): h = img.height img.save(path + '/' + naming_rule + '.png') def pasteFileToFolder(filepath, path): #文件路径,番号,后缀,要移动至的位置 - houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|avi|rmvb|wmv|mov|mp4|mkv|flv)$', filepath).group()) + houzhui = str(re.search('[.](AVI|RMVB|WMV|MOV|MP4|MKV|FLV|TS|avi|rmvb|wmv|mov|mp4|mkv|flv|ts)$', filepath).group()) os.rename(filepath, naming_rule + houzhui) shutil.move(naming_rule + houzhui, path) diff --git a/fc2fans_club.py b/fc2fans_club.py index a74ecb9..11272e3 100644 --- a/fc2fans_club.py +++ b/fc2fans_club.py @@ -38,8 +38,8 @@ def getOutline(htmlcode,number): #获取番号 # result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[4]/p/text()')).replace("\\n",'',10000).strip(" ['']").replace("'",'',10000) # return result -def main(number): - str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") +def main(number2): + number=number2.replace('PPV','').replace('ppv','') htmlcode = ADC_function.get_html('http://fc2fans.club/html/FC2-' + number + '.html') dic = { 'title': getTitle(htmlcode), diff --git a/javbus.py b/javbus.py index da62764..e5e1712 100644 --- a/javbus.py +++ b/javbus.py @@ -9,14 +9,7 @@ from bs4 import BeautifulSoup#need install from PIL import Image#need install import time import json - -def get_html(url):#网页请求核心 - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} - getweb = requests.get(str(url),timeout=10,headers=headers).text - try: - return getweb - except: - print("[-]Connect Failed! Please check your Proxy.") +from ADC_function import * def getTitle(htmlcode): #获取标题 doc = pq(htmlcode) @@ -34,7 +27,6 @@ def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') return image.attr('href') - print(image.attr('href')) def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") @@ -62,8 +54,10 @@ def getOutline(htmlcode): #获取演员 doc = pq(htmlcode) result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text()) return result - - +def getSerise(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") + return result def getTag(htmlcode): # 获取演员 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') @@ -79,7 +73,7 @@ def main(number): htmlcode=get_html('https://www.javbus.com/'+number) dww_htmlcode=get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) dic = { - 'title': getTitle(htmlcode), + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))), 'studio': getStudio(htmlcode), 'year': str(re.search('\d{4}',getYear(htmlcode)).group()), 'outline': getOutline(dww_htmlcode), @@ -90,7 +84,8 @@ def main(number): 'number': getNum(htmlcode), 'cover': getCover(htmlcode), 'imagecut': 1, - 'tag': getTag(htmlcode) + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') @@ -98,7 +93,7 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) dic = { - 'title': getTitle(htmlcode), + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))), 'studio': getStudio(htmlcode), 'year': getYear(htmlcode), 'outline': getOutline(dww_htmlcode), @@ -109,7 +104,8 @@ def main(number): 'number': getNum(htmlcode), 'cover': getCover(htmlcode), 'imagecut': 1, - 'tag': getTag(htmlcode) + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), } js2 = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js2 @@ -118,11 +114,12 @@ def main(number): def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/' + number) + dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", '')) dic = { - 'title': getTitle(htmlcode), + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))), 'studio': getStudio(htmlcode), 'year': getYear(htmlcode), - 'outline': getOutline(htmlcode), + 'outline': getOutline(dww_htmlcode), 'runtime': getRuntime(htmlcode), 'director': getDirector(htmlcode), 'actor': getActor(htmlcode), @@ -130,6 +127,7 @@ def main_uncensored(number): 'number': getNum(htmlcode), 'cover': getCover(htmlcode), 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), 'imagecut': 0, } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -138,7 +136,7 @@ def main_uncensored(number): number2 = number.replace('-', '_') htmlcode = get_html('https://www.javbus.com/' + number2) dic2 = { - 'title': getTitle(htmlcode), + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))), 'studio': getStudio(htmlcode), 'year': getYear(htmlcode), 'outline': '', @@ -149,6 +147,7 @@ def main_uncensored(number): 'number': getNum(htmlcode), 'cover': getCover(htmlcode), 'tag': getTag(htmlcode), + 'label':getSerise(htmlcode), 'imagecut': 0, } js2 = json.dumps(dic2, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') diff --git a/siro.py b/siro.py index f7359ee..aad875e 100644 --- a/siro.py +++ b/siro.py @@ -3,70 +3,74 @@ from lxml import etree import json import requests from bs4 import BeautifulSoup - -def get_html(url):#网页请求核心 - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} - cookies = {'adc':'1'} - getweb = requests.get(str(url),timeout=10,cookies=cookies,headers=headers).text - try: - return getweb - except: - print("[-]Connect Failed! Please check your Proxy.") +from ADC_function import * def getTitle(a): html = etree.fromstring(a, etree.HTMLParser()) result = str(html.xpath('//*[@id="center_column"]/div[2]/h1/text()')).strip(" ['']") return result def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[1]/td/a/text()')).strip(" ['\\n ']") - return result + html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() + result2=str(html.xpath('//table/tr[1]/td[1]/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[1]/td[1]/a/text()')).strip(" ['\\n ']") + return str(result1+result2).strip('+') def getStudio(a): html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[2]/td/a/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table[2]/tr[2]/td/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[2]/td[1]/a/text()')).strip(" ['\\n ']") + return str(result1+result2).strip('+') def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[3]/td/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table/tr[3]/td[1]/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[3]/td[1]/a/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+').strip('mi') +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) + result2=str(html.xpath('//table/tr[6]/td[1]/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[6]/td[1]/a/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+') def getNum(a): html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[4]/td/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table/tr[2]/td[4]/a/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[2]/td[4]/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+') def getYear(a): html = etree.fromstring(a, etree.HTMLParser()) - #result=str(html.xpath('//table[2]/tr[5]/td/text()')).strip(" ['\\n ']") - result=str(html.xpath('//table[2]/tr[5]/td/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table/tr[2]/td[5]/a/text()')).strip(" ['\\n ']") + result1=str(html.xpath('//table/tr[2]/td[5]/text()')).strip(" ['\\n ']") + return result2+result1 def getRelease(a): html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[5]/td/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table/tr[5]/td[1]/text()')).strip(" ['\\n ']") + result1 = str(html.xpath('//table/tr[5]/a/td[1]/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+') def getTag(a): html = etree.fromstring(a, etree.HTMLParser()) - result=str(html.xpath('//table[2]/tr[9]/td/text()')).strip(" ['\\n ']") - return result + result2=str(html.xpath('//table/tr[8]/td[1]/a/text()')).strip(" ['\\n ']") + result1=str(html.xpath('//table/tr[8]/td[1]/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+') def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="center_column"]/div[2]/div[1]/div/div/h2/img/@src')).strip(" ['']") return result def getDirector(a): html = etree.fromstring(a, etree.HTMLParser()) - result = str(html.xpath('//table[2]/tr[7]/td/a/text()')).strip(" ['\\n ']") - return result + result1 = str(html.xpath('//table/tr[2]/td[1]/text()')).strip(" ['\\n ']") + result2 = str(html.xpath('//table/tr[2]/td[1]/a/text()')).strip(" ['\\n ']") + return str(result1 + result2).strip('+') def getOutline(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") return result - def main(number): - htmlcode=get_html('https://www.mgstage.com/product/product_detail/'+str(number)) + htmlcode=get_html('https://www.mgstage.com/product/product_detail/'+str(number),cookies={'adc':'1'}) soup = BeautifulSoup(htmlcode, 'lxml') a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','') + #print(a) dic = { 'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''), 'studio': getStudio(a), - 'year': getYear(a), + 'year': str(re.search('\d{4}',getRelease(a)).group()), 'outline': getOutline(htmlcode), 'runtime': getRuntime(a), 'director': getDirector(a), @@ -75,7 +79,10 @@ def main(number): 'number': number, 'cover': getCover(htmlcode), 'imagecut': 0, - 'tag':' ', + 'tag': getTag(a).replace("'\\n',",'').replace(' ', '').replace("\\n','\\n",','), + 'label':getLabel(a) } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') - return js \ No newline at end of file + #print('https://www.mgstage.com/product/product_detail/'+str(number)) + return js +#print(main('SIRO-3552')) \ No newline at end of file