Update Pre-release 3.7
AV_Data_Capture.py
@@ -33,7 +33,7 @@ def movie_lists(root, escape_folder):
     if folder in root:
         return []
     total = []
-    file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV', '.MOV', '.MKV', '.FLV', '.TS', '.WEBM', ]
+    file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV', '.MOV', '.MKV', '.FLV', '.TS', '.WEBM', '.iso', '.ISO']
     dirs = os.listdir(root)
     for entry in dirs:
         f = os.path.join(root, entry)
@@ -110,7 +110,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
 
 
 if __name__ == '__main__':
-    version = '3.6'
+    version = '3.7'
 
     # Parse command line args
     single_file_path, config_file, auto_exit, custom_number = argparse_function()
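A note on the touched list: file_type carries every extension twice, once per case, and this change extends the pattern with '.iso'/'.ISO'. A case-insensitive check would make the duplication unnecessary; a minimal sketch (the helper name is hypothetical and not part of this commit):

# Sketch: match extensions case-insensitively instead of listing both cases.
FILE_TYPES = ('.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv',
              '.flv', '.ts', '.webm', '.iso')

def is_movie_file(entry: str) -> bool:
    # casefold() lets '.ISO', '.Iso' and '.iso' all match the same suffix
    return entry.casefold().endswith(FILE_TYPES)

assert is_movie_file('movie.ISO') and not is_movie_file('notes.txt')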
WebCrawler/avsox.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = {}
    for i in a:
        l = i.img['src']
        t = i.span.get_text()
        p2 = {t: l}
        d.update(p2)
    return d

def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except:
        return ''

def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = []
    for i in a:
        d.append(i.span.get_text())
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '", ' ')
    return result1

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
    return result1

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
    return result1

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
    return result1

def getYear(release):
    try:
        result = str(re.search('\d{4}', release).group())
        return result
    except:
        return release

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
    return result1

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
    return result

def getCover_small(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result

def getTag(a):  # get tags (the original comment said "actors"; this parses the genre boxes)
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    d = []
    for i in a:
        d.append(i.get_text())
    return d

def getSeries(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
        return result1
    except:
        return ''


def main(number):
    a = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # retry the search with '-' replaced by '_', then with '_' stripped
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            print(a)
            html = etree.fromstring(a, etree.HTMLParser())
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',
        'runtime': getRuntime(info),
        'director': '',
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
        'series': getSeries(info),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == "__main__":
    print(main('012717_472'))
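The cascading re-search in main() above retries the query with '-' swapped for '_' and then with '_' stripped. The same logic reads more compactly as a loop over candidate spellings; a sketch under the same get_html/etree assumptions, not part of the committed file:

def search_href(number):
    # try the number as given, then '-' -> '_', then with '_' removed
    for candidate in (number, number.replace('-', '_'), number.replace('_', '')):
        page = get_html('https://avsox.host/cn/search/' + candidate)
        html = etree.fromstring(page, etree.HTMLParser())
        hrefs = html.xpath('//*[@id="waterfall"]/div/a/@href')
        if hrefs:
            return hrefs[0]
    return ''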
WebCrawler/dlsite.py (new file, 131 lines)
@@ -0,0 +1,131 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

# print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
# XPath notes:
# title    //*[@id="work_name"]/a/text()
# studio   //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
# release  //th[contains(text(),"販売日")]/../td/a/text()
# story    //th[contains(text(),"シナリオ")]/../td/a/text()
# voice    //th[contains(text(),"声優")]/../td/a/text()
# tag      //th[contains(text(),"ジャンル")]/../td/div/a/text()
# outline  //*[@id="main_inner"]/div[3]/text()
# photo    //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src

# https://www.dlsite.com/pro/work/=/product_id/VJ013152.html


def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//*[@id="work_name"]/a/text()')[0]
    return result

def getActor(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
    return result1

def getActorPhoto(actor):
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
    return result

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
    return result

def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0]
    return result1.replace('年', '-').replace('月', '-').replace('日', '')

def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()')
        return result
    except:
        return ''


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results;
    # DO NOT just take the first one, take the one with the correct index
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
    return result

def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0]
    return result

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    total = []
    result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
    for i in result:
        total.append(i.strip('\r\n'))
    return str(total).strip(" ['']").replace("', '', '", r'\n').replace("', '", r'\n').strip(", '', '")

def getSeries(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
    return result1

def main(number):
    number = number.upper()
    htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')

    dic = {
        'actor': getActor(htmlcode),
        'title': getTitle(htmlcode),
        'studio': getStudio(htmlcode),
        'outline': getOutline(htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'release': getRelease(htmlcode),
        'number': number,
        'cover': 'https:' + getCover(htmlcode),
        'cover_small': '',
        'imagecut': 0,
        'tag': getTag(htmlcode),
        'label': getLabel(htmlcode),
        'year': getYear(getRelease(htmlcode)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
        'source': 'dlsite.py',
        'series': getSeries(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.\n")
if __name__ == "__main__":
    print(main('VJ013479'))
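One caveat in getRuntime() above: str.rstrip('mi') removes any run of trailing 'm' and 'i' characters rather than the literal suffix, so a value that legitimately ends in those letters would be clipped too. A safer variant, shown only as a sketch with a hypothetical helper name:

def strip_runtime_suffix(runtime: str) -> str:
    # remove an explicit unit suffix instead of relying on rstrip's
    # character-set semantics
    for suffix in ('min', 'mi', 'm'):
        if runtime.endswith(suffix):
            return runtime[:-len(suffix)]
    return runtime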
WebCrawler/fanza.py (new file, 297 lines)
@@ -0,0 +1,297 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import re
from urllib.parse import urlencode

from lxml import etree

from ADC_function import *

# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
    return result


def getActor(text):
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(text, etree.HTMLParser())
    result = (
        str(
            html.xpath(
                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
            )
        )
        .strip(" ['']")
        .replace("', '", ",")
    )
    return result


def getStudio(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
        )[0]
    return result


def getRuntime(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    return re.search(r"\d+", str(result)).group()


def getLabel(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
        )[0]
    return result


def getNum(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
        )[0]
    return result


def getYear(getRelease):
    try:
        result = str(re.search(r"\d{4}", getRelease).group())
        return result
    except:
        return getRelease


def getRelease(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
        )[0].lstrip("\n")
    except:
        try:
            result = html.xpath(
                "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
            )[0].lstrip("\n")
        except:
            result = "----"
    if result == "----":
        try:
            result = html.xpath(
                "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
            )[0].lstrip("\n")
        except:
            try:
                result = html.xpath(
                    "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
                )[0].lstrip("\n")
            except:
                pass
    return result.replace("/", "-")


def getTag(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
        )
    except:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
        )
    return result


def getCover(text, number):
    html = etree.fromstring(text, etree.HTMLParser())
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except:
        # sometimes fanza changes _ to \u005f in the image id
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except:
            # (TODO) handle more edge cases
            # print(html)
            # raise an exception here, same behavior as before;
            # people's major requirement is fetching the picture
            raise ValueError("can not find image")
    return result


def getDirector(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
        )[0]
    return result


def getOutline(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
            "\n", ""
        )
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
                "\n", ""
            )
    except:
        # (TODO) handle more edge cases
        # print(html)
        return ""
    return result


def getSeries(text):
    try:
        html = etree.fromstring(text, etree.HTMLParser())
        try:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
            )[0]
        except:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
            )[0]
        return result
    except:
        return ""


def main(number):
    # fanza allows letters + numbers + underscore; normalize the input here
    # @note: the only observed use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
        "https://www.dmm.co.jp/rental/-/detail/=/cid=",
    ]
    chosen_url = ""

    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(
            "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": chosen_url})
            )
        )
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": ""})
    try:
        # for some old pages, the input number does not match the page:
        # the url may be cid=test012 while the hinban on the page is test00012,
        # so get the hinban first and pass it to the following functions
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(
                getRelease(htmlcode)
            ),  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
            "series": getSeries(htmlcode),
        }
    except:
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js


def main_htmlcode(number):
    # fanza allows letters + numbers + underscore; normalize the input here
    # @note: the only observed use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": ""})
    return htmlcode


if __name__ == "__main__":
    print(main("DV-1562"))
    print(main("96fad1217"))
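The age gate is cleared by requesting the declared=yes endpoint with the real detail page URL-encoded into the rurl parameter, exactly as main() does above. Roughly, with a hypothetical cid:

from urllib.parse import urlencode
target = "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=ssis00001"  # hypothetical cid
gate = "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(urlencode({"rurl": target}))
# -> https://www.dmm.co.jp/age_check/=/declared=yes/?rurl=https%3A%2F%2Fwww.dmm.co.jp%2F...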
WebCrawler/fc2fans_club.py (new file, 165 lines)
@@ -0,0 +1,165 @@
import re
from lxml import etree  # needs install
import json
import ADC_function
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(htmlcode):  # get title
    # print(htmlcode)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    result2 = str(re.sub('\D{2}2-\d+', '', result)).replace(' ', '', 1)
    # print(result2)
    return result2

def getActor(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result

def getNum(htmlcode):  # get the number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    # print(result)
    return result

def getRelease(htmlcode2):
    # a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result

def getCover(htmlcode, number, htmlcode2):  # get cover
    # a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    if result == '':
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
        return 'https://fc2club.com' + result2
    return 'http:' + result

def getOutline(htmlcode2):  # get outline
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result

def getTag(htmlcode):  # get tags
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')

def getYear(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''


def getTitle_fc2com(htmlcode):  # get title (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
    return result

def getActor_fc2com(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
        return result
    except:
        return ''

def getStudio_fc2com(htmlcode):  # get studio (fc2.com)
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getNum_fc2com(htmlcode):  # get the number (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getRelease_fc2com(htmlcode2):
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result

def getCover_fc2com(htmlcode2):  # get cover (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
    return 'http:' + result

def getOutline_fc2com(htmlcode2):  # get outline (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result

def getTag_fc2com(number):  # get tags (fc2.com)
    htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/' + number + '/tag?'), 'utf-8').decode('unicode-escape'))
    result = re.findall('"tag":"(.*?)"', htmlcode)
    return result

def getYear_fc2com(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''

def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
        actor = getActor(htmlcode)
        if getActor(htmlcode) == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
            'outline': '',  # getOutline(htmlcode2),
            'runtime': getYear(getRelease(htmlcode)),
            'director': getStudio(htmlcode),
            'actor': actor,
            'release': getRelease(number),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'actor_photo': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'https://fc2club.com//html/FC2-' + number + '.html',
            'series': '',
        }
        if dic['title'] == '':
            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', cookies={'wei6H': '1'})
            actor = getActor(htmlcode)
            if getActor(htmlcode) == '':
                actor = 'FC2系列'
            dic = {
                'title': getTitle_fc2com(htmlcode2),
                'studio': getStudio_fc2com(htmlcode2),
                'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
                'outline': getOutline_fc2com(htmlcode2),
                'runtime': getYear_fc2com(getRelease(htmlcode2)),
                'director': getStudio_fc2com(htmlcode2),
                'actor': actor,
                'release': getRelease_fc2com(number),
                'number': 'FC2-' + number,
                'cover': getCover_fc2com(htmlcode2),
                'imagecut': 0,
                'tag': getTag_fc2com(number),
                'label': '',
                'actor_photo': '',
                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
                'series': '',
            }
    except Exception as e:
        # (TODO) handle this better
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    print(main('1252953'))
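main() only strips an 'FC2-'/'fc2-' prefix, while FC2 ids also circulate as fc2_1252953 or FC2-PPV-1252953. A more tolerant normalizer could keep just the trailing digit run; a sketch with a hypothetical helper name, not part of this commit:

import re

def normalize_fc2(number: str) -> str:
    # 'FC2-PPV-1252953' / 'fc2_1252953' / '1252953' -> '1252953'
    m = re.search(r'(\d+)\s*$', number)
    return m.group(1) if m else number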
WebCrawler/jav321.py (new file, 156 lines)
@@ -0,0 +1,156 @@
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html


def main(number: str) -> str:
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/video/" in result.url:
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            **data,
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_title(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()


def parse_info(soup: BeautifulSoup) -> dict:
    data = soup.select_one("div.row > div.col-md-9")

    if data:
        dd = str(data).split("<br/>")
        data_dic = {}
        for d in dd:
            data_dic[get_bold_text(h=d)] = d

        return {
            "actor": get_actor(data_dic),
            "label": get_label(data_dic),
            "studio": get_studio(data_dic),
            "tag": get_tag(data_dic),
            "number": get_number(data_dic),
            "release": get_release(data_dic),
            "runtime": get_runtime(data_dic),
            "series": get_series(data_dic),
        }
    else:
        return {}


def get_bold_text(h: str) -> str:
    soup = BeautifulSoup(h, "html.parser")
    if soup.b:
        return soup.b.text
    else:
        return "UNKNOWN_TAG"


def get_anchor_info(h: str) -> str:
    result = []

    data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
    for d in data:
        result.append(d.text)

    return ",".join(result)


def get_text_info(h: str) -> str:
    return h.split(": ")[1]


def get_cover(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]


def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]


def get_series2(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]


def get_actor(data: dict) -> str:
    if "女优" in data:
        return get_anchor_info(data["女优"])
    else:
        return ""


def get_label(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_tag(data: dict) -> str:
    if "标签" in data:
        return get_anchor_info(data["标签"])
    else:
        return ""


def get_studio(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_number(data: dict) -> str:
    if "番号" in data:
        return get_text_info(data["番号"])
    else:
        return ""


def get_release(data: dict) -> str:
    if "发行日期" in data:
        return get_text_info(data["发行日期"])
    else:
        return ""


def get_runtime(data: dict) -> str:
    if "播放时长" in data:
        return get_text_info(data["播放时长"])
    else:
        return ""


def get_year(data: dict) -> str:
    if "release" in data:
        return data["release"][:4]
    else:
        return ""


def get_series(data: dict) -> str:
    if "系列" in data:
        return get_anchor_info(data["系列"])
    else:
        return ""


if __name__ == "__main__":
    print(main("soe-259"))
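parse_info() works by splitting the detail block on '<br/>' and keying each fragment by its bold label, so every field lookup becomes a dictionary access. A toy fragment (hypothetical markup) shows the mechanics of get_bold_text() and get_text_info():

from bs4 import BeautifulSoup
fragment = '<b>番号</b>: soe-259'                      # one <br/>-separated chunk
key = BeautifulSoup(fragment, 'html.parser').b.text   # '番号'   (get_bold_text)
value = fragment.split(': ')[1]                       # 'soe-259' (get_text_info)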
WebCrawler/javbus.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import re
from pyquery import PyQuery as pq  # needs install
from lxml import etree  # needs install
from bs4 import BeautifulSoup  # needs install
import json
from ADC_function import *
import fanza


def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    d = {}
    for i in a:
        l = i.a['href']
        t = i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
        p = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        p2 = {t: p}
        d.update(p2)
    return d

def getTitle(htmlcode):  # get title
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    try:
        title2 = re.sub('n\d+-', '', title)
        return title2
    except:
        return title

def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result

def getYear(htmlcode):  # get year
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result

def getCover(htmlcode):  # get cover link
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    return image.attr('href')

def getRelease(htmlcode):  # get release date
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result

def getRuntime(htmlcode):  # get runtime in minutes
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a

def getActor(htmlcode):  # get actresses
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.get_text())
    return b

def getNum(htmlcode):  # get the number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getDirector(htmlcode):  # get director
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result

def getCID(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    # print(htmlcode)
    string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/', '')
    result = re.sub('/.*?.jpg', '', string)
    return result

def getOutline(htmlcode):  # get outline (the original comment said "actors")
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n', '')
        return result
    except:
        return ''

def getSerise(htmlcode):  # get series
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
        return result
    except:
        return ''

def getTag(htmlcode):  # get tags
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    for i in a:
        if 'onmouseout' in str(i):
            continue
        tag.append(i.get_text())
    return tag


def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/' + number)
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except:
        dww_htmlcode = ''
    dic = {
        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


def main(number):
    try:
        try:
            htmlcode = get_html('https://www.javbus.com/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except:
                dww_htmlcode = ''
            dic = {
                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
                'outline': getOutline(dww_htmlcode),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
                'release': getRelease(htmlcode),
                'number': getNum(htmlcode),
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                            separators=(',', ':'), )  # .encode('UTF-8')
            return js
        except:
            return main_uncensored(number)
    except:
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js


if __name__ == "__main__":
    print(main('ipx-292'))
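getCID() recovers the DMM content id that links a javbus page to fanza: it takes the first sample-image href, drops the pics.dmm.co.jp prefix, and cuts everything from the first '/'-to-'.jpg' run. Walked through on a hypothetical href:

import re
href = 'https://pics.dmm.co.jp/digital/video/ipx00292/ipx00292-1.jpg'  # hypothetical sample link
string = href.replace('https://pics.dmm.co.jp/digital/video/', '')    # 'ipx00292/ipx00292-1.jpg'
cid = re.sub('/.*?.jpg', '', string)                                   # 'ipx00292'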
WebCrawler/javdb.py (new file, 154 lines)
@@ -0,0 +1,154 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("/html/body/section/div/h2/strong/text()")[0]
    return result

def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')

def getActorPhoto(actor):
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')

def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+')

def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results;
    # DO NOT just take the first one, take the one with the correct index
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
    return result

def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result

def getSeries(a):
    # /html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

def main(number):
    try:
        number = number.upper()
        try:
            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        except:
            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser())
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
        # iterate all candidates and find the matching one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
        ids = html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)

        # if the gray placeholder image was returned, replace it with the normal cover
        cover_small = getCover_small(query_result, index=ids.index(number))
        if 'placeholder' in cover_small:
            cover_small = getCover(detail_page)

        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': cover_small,
            'imagecut': 3,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.\n")
if __name__ == "__main__":
    print(main('snyz-007'))
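Because the search page can list several videos, main() pairs the href column with the uid column and selects by exact number match; list.index raises ValueError when the number is absent, which the outer except turns into an empty result. In miniature, with made-up values:

urls = ['/v/abc123', '/v/def456']      # hypothetical candidate hrefs
ids = ['SNYZ-006', 'SNYZ-007']         # uid text shown beside each candidate
number = 'SNYZ-007'
correct_url = urls[ids.index(number)]  # '/v/def456'; ValueError if no match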
WebCrawler/javlib.py (new file, 110 lines)
@@ -0,0 +1,110 @@
import json
import bs4
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie

from ADC_function import get_javlib_cookie, get_html


def main(number: str):
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie dictionary
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {}
    for key, morsel in s_cookie.items():
        cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": "",
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")

    if tag is not None:
        return tag.string.strip()
    else:
        return ""


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("a")

    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("td", class_="text")

    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    values = []
    for tag in tags:
        value = tag.string
        if value is not None and value != "----":
            values.append(value)

    return ",".join(x for x in values if x)


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")

    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))


if __name__ == "__main__":
    lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    # lists = ["DVMC-003"]
    for num in lists:
        print(main(num))
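get_javlib_cookie() hands back a raw Cookie header string; SimpleCookie parses it into morsels that main() flattens into the dict get_html expects. A small sketch with a made-up cookie string:

from http.cookies import SimpleCookie
raw = 'cf_clearance=abc123; session=xyz789'   # hypothetical raw header
s = SimpleCookie()
s.load(raw)
cookies = {key: morsel.value for key, morsel in s.items()}
# {'cf_clearance': 'abc123', 'session': 'xyz789'}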
WebCrawler/mgstage.py (new file, 120 lines)
@@ -0,0 +1,120 @@
|
||||
import re
|
||||
from lxml import etree
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from ADC_function import *
|
||||
# import sys
|
||||
# import io
|
||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
||||
|
||||
def getTitle(a):
|
||||
try:
|
||||
html = etree.fromstring(a, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
|
||||
return result.replace('/', ',')
|
||||
except:
|
||||
return ''
|
||||
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
|
||||
def getStudio(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
||||
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getRuntime(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
return str(result1 + result2).strip('+').rstrip('mi')
|
||||
def getLabel(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
def getNum(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+')
|
||||
def getYear(getRelease):
|
||||
try:
|
||||
result = str(re.search('\d{4}',getRelease).group())
|
||||
return result
|
||||
except:
|
||||
return getRelease
|
||||
def getRelease(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace('/','-')
|
||||
def getTag(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
||||
'\\n')
|
||||
return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
||||
def getCover(htmlcode):
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
|
||||
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
|
||||
return result
|
||||
def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # assumption: the director row is labelled "監督:" on mgstage detail pages;
    # querying "シリーズ" here would just duplicate getSeries()
    result1 = str(html.xpath('//th[contains(text(),"監督:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"監督:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
    return result
def getSeries(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def main(number2):
    number = number2.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/', cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    # print(b)
    dic = {
        'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
        'studio': getStudio(a),
        'outline': getOutline(b),
        'runtime': getRuntime(a),
        'director': getDirector(a),
        'actor': getActor(a),
        'release': getRelease(a),
        'number': getNum(a),
        'cover': getCover(htmlcode),
        'imagecut': 0,
        'tag': getTag(a),
        'label': getLabel(a),
        'year': getYear(getRelease(a)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
        'source': 'mgstage.py',
        'series': getSeries(a),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
    # print(htmlcode)

if __name__ == '__main__':
    print(main('SIRO-4149'))
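# A minimal consumption sketch (assuming get_html() from ADC_function performs the
# HTTP GET): main() returns a JSON string, so callers decode it before reading fields:
#   data = json.loads(main('SIRO-4149'))
#   print(data['number'], data['title'], data['release'])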
192
WebCrawler/xcity.py
Normal file
@@ -0,0 +1,192 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *


# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
    return result


def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
    return result1

def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d
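# e.g. getActorPhoto('A,B') -> {'A': '', 'B': ''}: placeholder entries only, since
# actor photos are not scraped from xcity.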


def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # str() of an xpath result list never raises, so fall back on an empty result
    # instead of an except branch that could never trigger
    result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    if not result:
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d+', result1)[0]
    except:
        return ''


def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
        return result
    except:
        return ''


def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="hinban"]/text()')[0]
        return result
    except:
        return ''

def getYear(release):  # parameter renamed so it no longer shadows getRelease() below
    try:
        return str(re.search(r'\d{4}', release).group())
    except:
        return release

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except:
        return ''

def getTag(a):
    result2 = []
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
    for i in result1:
        i = i.replace(u'\n', '')
        i = i.replace(u'\t', '')
        result2.append(i)
    return result2

def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results, so
    # DO NOT just take the first one; take the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    if 'https' not in result:
        result = 'https:' + result
    return result

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
        return 'https:' + result
    except:
        return ''
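# Assumption: xcity serves protocol-relative links (beginning with //), which is why
# getCover() and getCover_small() prepend 'https:'.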


def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n', '').replace(u'\t', '')
        return result
    except:
        return ''

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
    except:
        return ''
    try:
        return re.sub(r'\\\w*\d+', '', result)
    except:
        return result

def getSeries(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        try:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
            return result
        except:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
            return result
    except:
        return ''

def main(number):
    try:
        number = number.upper()
        query_result = get_html(
            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-', '') + '&sg=main&num=30')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
        detail_page = get_html('https://xcity.jp' + urls)
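        # The keyword search returns a 'resultList' table; the first data row's link is
        # taken as the best match, and its detail page is what the getters below parse.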
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': '',
            'imagecut': 1,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://xcity.jp' + urls,
            'source': 'xcity.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}

    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    print(main('VNDS-2624'))
5
core.py
@@ -17,6 +17,7 @@ import javdb
import mgstage
import xcity
import javlib
import dlsite


def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -56,6 +57,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
        "jav321": jav321.main,
        "xcity": xcity.main,
        "javlib": javlib.main,
        "dlsite": dlsite.main,
    }

    # default fetch order list, from the beginning to the end
@@ -74,6 +76,9 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
    elif "fc2" in file_number or "FC2" in file_number:
        sources.insert(0, sources.pop(sources.index("fc2")))

    elif any(kw in file_number for kw in ("RJ", "rj", "VJ", "vj")):
        sources.insert(0, sources.pop(sources.index("dlsite")))
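    # Design note: a bare string literal such as "rj" is always truthy in Python, so
    # chaining it with `or` would make this branch match every filename; any() keeps
    # the membership test on each RJ/VJ code prefix instead.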

    json_data = {}
    for source in sources:
        json_data = json.loads(func_mapping[source](file_number))