Update 3.7-2
124 avsox.py
@@ -1,124 +0,0 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = {}
    for i in a:
        l = i.img['src']
        t = i.span.get_text()
        p2 = {t: l}
        d.update(p2)
    return d
def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except:
        return ''
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = []
    for i in a:
        d.append(i.span.get_text())
    return d
def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '", ' ')
    return result1
def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
    return result1
def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
    return result1
def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
    return result1
def getYear(release):
    try:
        result = str(re.search('\d{4}', release).group())
        return result
    except:
        return release
def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
    return result1
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
    return result
def getCover_small(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result
def getTag(a):  # get genre tags
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    d = []
    for i in a:
        d.append(i.get_text())
    return d
def getSeries(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
        return result1
    except:
        return ''

def main(number):
    a = get_html('https://avsox.host/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # retry with the underscore form of the number, e.g. 012717-472 -> 012717_472
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            # last resort: drop the separator entirely
            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            print(a)
            html = etree.fromstring(a, etree.HTMLParser())
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',
        'runtime': getRuntime(info),
        'director': '',
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
        'series': getSeries(info),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

if __name__ == "__main__":
    print(main('012717_472'))
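
All of these scrapers return their result as a JSON string rather than a dict; a minimal sketch of how a caller might consume avsox.main() (the field names come from the dic built above):

    import json

    meta = json.loads(main('012717_472'))  # parse the JSON string back into a dict
    print(meta['number'], meta['title'])   # fields as assembled in dic above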
297 fanza.py
@@ -1,297 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import re
from urllib.parse import urlencode

from lxml import etree

from ADC_function import *

# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
    return result


def getActor(text):
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(text, etree.HTMLParser())
    result = (
        str(
            html.xpath(
                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
            )
        )
        .strip(" ['']")
        .replace("', '", ",")
    )
    return result


def getStudio(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
        )[0]
    return result


def getRuntime(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    return re.search(r"\d+", str(result)).group()


def getLabel(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
        )[0]
    return result


def getNum(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
        )[0]
    return result


def getYear(getRelease):
    try:
        result = str(re.search(r"\d{4}", getRelease).group())
        return result
    except:
        return getRelease


def getRelease(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
        )[0].lstrip("\n")
    except:
        try:
            result = html.xpath(
                "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
            )[0].lstrip("\n")
        except:
            result = "----"
    if result == "----":
        try:
            result = html.xpath(
                "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
            )[0].lstrip("\n")
        except:
            try:
                result = html.xpath(
                    "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
                )[0].lstrip("\n")
            except:
                pass
    return result.replace("/", "-")


def getTag(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
        )
    except:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
        )
    return result


def getCover(text, number):
    html = etree.fromstring(text, etree.HTMLParser())
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except:
        # sometimes fanza changes "_" to "\u005f" in the image id
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except:
            # (TODO) handle more edge cases
            # print(html)
            # raise an exception here, same behavior as before;
            # fetching the picture is the main requirement
            raise ValueError("can not find image")
    return result


def getDirector(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
        )[0]
    return result


def getOutline(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
            "\n", ""
        )
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
                "\n", ""
            )
    except:
        # (TODO) handle more edge cases
        # print(html)
        return ""
    return result


def getSeries(text):
    try:
        html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        try:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
            )[0]
        except:
            result = html.xpath(
                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
            )[0]
        return result
    except:
        return ""


def main(number):
    # fanza allows letter + number + underscore; normalize the input here
    # @note: the only underscore usage found so far looks like h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input, so restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
        "https://www.dmm.co.jp/rental/-/detail/=/cid=",
    ]
    chosen_url = ""

    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(
            "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": chosen_url})
            )
        )
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "",})
    try:
        # for some old pages, the input number does not match the page:
        # for example, the url will be cid=test012
        # but the hinban on the page is test00012,
        # so get the hinban first, and then pass it to the following functions
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(
                getRelease(htmlcode)
            ),  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
            "series": getSeries(htmlcode),
        }
    except:
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js


def main_htmlcode(number):
    # fanza allows letter + number + underscore; normalize the input here
    # @note: the only underscore usage found so far looks like h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input, so restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "",})
    return htmlcode


if __name__ == "__main__":
    print(main("DV-1562"))
    print(main("96fad1217"))
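
fanza.main() never hits a detail page directly; it routes each candidate URL through DMM's age_check endpoint with the real page percent-encoded into the rurl parameter. A small sketch of the URL this produces, using the first template in fanza_urls and a made-up cid:

    from urllib.parse import urlencode

    chosen_url = 'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=h_test123456789'  # hypothetical cid
    url = 'https://www.dmm.co.jp/age_check/=/declared=yes/?{}'.format(urlencode({'rurl': chosen_url}))
    print(url)
    # https://www.dmm.co.jp/age_check/=/declared=yes/?rurl=https%3A%2F%2Fwww.dmm.co.jp%2Fdigital%2Fvideoa%2F-%2Fdetail%2F%3D%2Fcid%3Dh_test123456789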
165 fc2fans_club.py
@@ -1,165 +0,0 @@
import re
from lxml import etree  # needs install
import json
import ADC_function
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(htmlcode):  # get the title
    #print(htmlcode)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    result2 = str(re.sub('\D{2}2-\d+', '', result)).replace(' ', '', 1)
    #print(result2)
    return result2
def getActor(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
        return result
    except:
        return ''
def getStudio(htmlcode):  # get the studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result
def getNum(htmlcode):  # get the ID number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    #print(result)
    return result
def getRelease(htmlcode2):  # get the release date
    #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result
def getCover(htmlcode, number, htmlcode2):  # get the cover image
    #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    if result == '':
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
        return 'https://fc2club.com' + result2
    return 'http:' + result
def getOutline(htmlcode2):  # get the outline
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result
def getTag(htmlcode):  # get the tags
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')
def getYear(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''

def getTitle_fc2com(htmlcode):  # get the title
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
    return result
def getActor_fc2com(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
        return result
    except:
        return ''
def getStudio_fc2com(htmlcode):  # get the studio
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
        return result
    except:
        return ''
def getNum_fc2com(htmlcode):  # get the ID number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result
def getRelease_fc2com(htmlcode2):  # get the release date
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result
def getCover_fc2com(htmlcode2):  # get the cover image
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
    return 'http:' + result
def getOutline_fc2com(htmlcode2):  # get the outline
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result
def getTag_fc2com(number):  # get the tags from the fc2 tag API
    htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/' + number + '/tag?'), 'utf-8').decode('unicode-escape'))
    result = re.findall('"tag":"(.*?)"', htmlcode)
    return result
def getYear_fc2com(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''

def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
        actor = getActor(htmlcode)
        if getActor(htmlcode) == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
            'outline': '',  # getOutline(htmlcode2),
            'runtime': getYear(getRelease(htmlcode)),
            'director': getStudio(htmlcode),
            'actor': actor,
            'release': getRelease(htmlcode2),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'actor_photo': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'https://fc2club.com//html/FC2-' + number + '.html',
            'series': '',
        }
        if dic['title'] == '':
            # the fc2club page returned nothing usable; fall back to the fc2.com article page
            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', cookies={'wei6H': '1'})
            actor = getActor(htmlcode)
            if getActor(htmlcode) == '':
                actor = 'FC2系列'
            dic = {
                'title': getTitle_fc2com(htmlcode2),
                'studio': getStudio_fc2com(htmlcode2),
                'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
                'outline': getOutline_fc2com(htmlcode2),
                'runtime': getYear_fc2com(getRelease(htmlcode2)),
                'director': getStudio_fc2com(htmlcode2),
                'actor': actor,
                'release': getRelease_fc2com(htmlcode2),
                'number': 'FC2-' + number,
                'cover': getCover_fc2com(htmlcode2),
                'imagecut': 0,
                'tag': getTag_fc2com(number),
                'label': '',
                'actor_photo': '',
                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
                'series': '',
            }
    except Exception as e:
        # (TODO) handle this better
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)  # .encode('UTF-8')
    return js

if __name__ == '__main__':
    print(main('1252953'))
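
fc2fans_club.main() accepts the number with or without the FC2- prefix; the normalization and the two pages it then fetches, shown in isolation:

    number = 'FC2-1252953'.replace('FC2-', '').replace('fc2-', '')      # -> '1252953'
    print('https://adult.contents.fc2.com/article/' + number + '/')     # fc2.com article page
    print('https://fc2club.com//html/FC2-' + number + '.html')          # fc2club mirror page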
156 jav321.py
@@ -1,156 +0,0 @@
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html


def main(number: str) -> str:
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/video/" in result.url:
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            **data,
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_title(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()


def parse_info(soup: BeautifulSoup) -> dict:
    data = soup.select_one("div.row > div.col-md-9")

    if data:
        dd = str(data).split("<br/>")
        data_dic = {}
        for d in dd:
            data_dic[get_bold_text(h=d)] = d

        return {
            "actor": get_actor(data_dic),
            "label": get_label(data_dic),
            "studio": get_studio(data_dic),
            "tag": get_tag(data_dic),
            "number": get_number(data_dic),
            "release": get_release(data_dic),
            "runtime": get_runtime(data_dic),
            "series": get_series(data_dic),
        }
    else:
        return {}


def get_bold_text(h: str) -> str:
    soup = BeautifulSoup(h, "html.parser")
    if soup.b:
        return soup.b.text
    else:
        return "UNKNOWN_TAG"


def get_anchor_info(h: str) -> str:
    result = []

    data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
    for d in data:
        result.append(d.text)

    return ",".join(result)


def get_text_info(h: str) -> str:
    return h.split(": ")[1]


def get_cover(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]


def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]

def get_series2(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]


def get_actor(data: dict) -> str:
    if "女优" in data:
        return get_anchor_info(data["女优"])
    else:
        return ""


def get_label(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_tag(data: dict) -> str:
    if "标签" in data:
        return get_anchor_info(data["标签"])
    else:
        return ""


def get_studio(data: dict) -> str:
    if "片商" in data:
        return get_anchor_info(data["片商"])
    else:
        return ""


def get_number(data: dict) -> str:
    if "番号" in data:
        return get_text_info(data["番号"])
    else:
        return ""


def get_release(data: dict) -> str:
    if "发行日期" in data:
        return get_text_info(data["发行日期"])
    else:
        return ""


def get_runtime(data: dict) -> str:
    if "播放时长" in data:
        return get_text_info(data["播放时长"])
    else:
        return ""


def get_year(data: dict) -> str:
    if "release" in data:
        return data["release"][:4]
    else:
        return ""


def get_series(data: dict) -> str:
    if "系列" in data:
        return get_anchor_info(data["系列"])
    else:
        return ""


if __name__ == "__main__":
    print(main("soe-259"))
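
parse_info() splits the info column on <br/> and keys each chunk by its bold label, so lookups like data_dic['番号'] work regardless of the order the site lists the fields in. A self-contained illustration of that trick (the HTML fragment is made up):

    from bs4 import BeautifulSoup

    fragment = '<b>番号</b>: soe-259'  # hypothetical chunk produced by splitting on <br/>
    soup = BeautifulSoup(fragment, 'html.parser')
    key = soup.b.text if soup.b else 'UNKNOWN_TAG'  # same logic as get_bold_text()
    print(key, '->', fragment.split(': ')[1])       # 番号 -> soe-259, cf. get_text_info()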
167 javbus.py
@@ -1,167 +0,0 @@
import re
from pyquery import PyQuery as pq  # needs install
from lxml import etree  # needs install
from bs4 import BeautifulSoup  # needs install
import json
from ADC_function import *
import fanza

def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    d = {}
    for i in a:
        l = i.a['href']
        t = i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
        p = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        p2 = {t: p}
        d.update(p2)
    return d
def getTitle(htmlcode):  # get the title
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    try:
        title2 = re.sub('n\d+-', '', title)
        return title2
    except:
        return title
def getStudio(htmlcode):  # get the studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result
def getYear(htmlcode):  # get the year
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result
def getCover(htmlcode):  # get the cover url
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    return image.attr('href')
def getRelease(htmlcode):  # get the release date
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result
def getRuntime(htmlcode):  # get the runtime in minutes
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a
def getActor(htmlcode):  # get the actors
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.get_text())
    return b
def getNum(htmlcode):  # get the ID number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result
def getDirector(htmlcode):  # get the director
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result
def getCID(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    #print(htmlcode)
    string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/', '')
    result = re.sub('/.*?.jpg', '', string)
    return result
def getOutline(htmlcode):  # get the outline
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n', '')
        return result
    except:
        return ''
def getSerise(htmlcode):  # get the series
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
        return result
    except:
        return ''
def getTag(htmlcode):  # get genre tags
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    for i in a:
        if 'onmouseout' in str(i):
            continue
        tag.append(i.get_text())
    return tag


def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/' + number)
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except:
        dww_htmlcode = ''
    dic = {
        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


def main(number):
    try:
        try:
            htmlcode = get_html('https://www.javbus.com/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except:
                dww_htmlcode = ''
            dic = {
                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
                'outline': getOutline(dww_htmlcode),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
                'release': getRelease(htmlcode),
                'number': getNum(htmlcode),
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }
            js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                            separators=(',', ':'), )  # .encode('UTF-8')
            return js
        except:
            # the censored layout failed; fall back to the uncensored page layout
            return main_uncensored(number)
    except:
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js


if __name__ == "__main__":
    print(main('ipx-292'))
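
getCID() recovers the DMM content id from the first sample-image link so that main() can pull the outline from fanza; the string surgery in isolation, on a made-up href:

    import re

    href = 'https://pics.dmm.co.jp/digital/video/ipx00292/ipx00292-1.jpg'  # hypothetical sample-box href
    string = href.replace('https://pics.dmm.co.jp/digital/video/', '')     # 'ipx00292/ipx00292-1.jpg'
    print(re.sub('/.*?.jpg', '', string))                                  # 'ipx00292'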
154 javdb.py
@@ -1,154 +0,0 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("/html/body/section/div/h2/strong/text()")[0]
    return result
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d
def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')
def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease
def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+')
def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result

def getCover_small(a, index=0):
    # same issue as mentioned in main() below:
    # javdb sometimes returns multiple results,
    # so DO NOT just take the first one; get the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover url crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
    except:  # 2020.7.17 repair cover url crawl
        result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
    return result
def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result
def getSeries(a):
    # /html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def main(number):
    try:
        number = number.upper()
        try:
            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        except:
            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for,
        # so iterate over all candidates and find the matching one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
        ids = html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)

        # if the gray placeholder image was returned, replace it with the normal cover
        cover_small = getCover_small(query_result, index=ids.index(number))
        if 'placeholder' in cover_small:
            cover_small = getCover(detail_page)

        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': cover_small,
            'imagecut': 3,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.")
if __name__ == "__main__":
    print(main('snyz-007'))
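
The candidate matching in javdb.main() is a parallel-list lookup: the uid column is searched for the exact number and the href at the same index wins. A minimal sketch with made-up lists:

    urls = ['/v/abc123', '/v/def456']      # hypothetical hrefs from the search page
    ids = ['SNYZ-006', 'SNYZ-007']         # uid text under each search result
    number = 'SNYZ-007'
    correct_url = urls[ids.index(number)]  # ValueError on no exact match, caught by the outer try
    print(correct_url)                     # '/v/def456'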
110 javlib.py
@@ -1,110 +0,0 @@
import json
import bs4
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie

from ADC_function import get_javlib_cookie, get_html


def main(number: str):
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie dictionary from the raw cookie string
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {}
    for key, morsel in s_cookie.items():
        cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": "",
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")

    if tag is not None:
        return tag.string.strip()
    else:
        return ""


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("a")

    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("td", class_="text")

    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    values = []
    for tag in tags:
        value = tag.string
        if value is not None and value != "----":
            values.append(value)

    return ",".join(x for x in values if x)


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")

    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))


if __name__ == "__main__":
    lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    #lists = ["DVMC-003"]
    for num in lists:
        print(main(num))
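
javlib.main() converts the raw cookie string from get_javlib_cookie() into a plain dict before passing it to get_html(); the conversion in isolation, with a made-up cookie string:

    from http.cookies import SimpleCookie

    s_cookie = SimpleCookie()
    s_cookie.load('cf_clearance=abc123; over18=1')  # hypothetical raw cookie header
    cookies = {key: morsel.value for key, morsel in s_cookie.items()}
    print(cookies)  # {'cf_clearance': 'abc123', 'over18': '1'}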
120 mgstage.py
@@ -1,120 +0,0 @@
import re
|
|
||||||
from lxml import etree
|
|
||||||
import json
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from ADC_function import *
|
|
||||||
# import sys
|
|
||||||
# import io
|
|
||||||
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
|
|
||||||
|
|
||||||
def getTitle(a):
|
|
||||||
try:
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser())
|
|
||||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
|
|
||||||
return result.replace('/', ',')
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
|
||||||
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
|
|
||||||
def getStudio(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
|
|
||||||
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
|
|
||||||
def getRuntime(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
return str(result1 + result2).strip('+').rstrip('mi')
|
|
||||||
def getLabel(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
|
||||||
def getNum(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+')
|
|
||||||
def getYear(getRelease):
|
|
||||||
try:
|
|
||||||
result = str(re.search('\d{4}',getRelease).group())
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return getRelease
|
|
||||||
def getRelease(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace('/','-')
|
|
||||||
def getTag(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
|
||||||
def getCover(htmlcode):
|
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
|
|
||||||
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
|
|
||||||
return result
|
|
||||||
def getDirector(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
|
||||||
def getOutline(htmlcode):
|
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
|
||||||
return result
|
|
||||||
def getSeries(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
|
|
||||||
'\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
|
||||||
def main(number2):
    number = number2.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/', cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    # print(b)
    dic = {
        'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
        'studio': getStudio(a),
        'outline': getOutline(b),
        'runtime': getRuntime(a),
        'director': getDirector(a),
        'actor': getActor(a),
        'release': getRelease(a),
        'number': getNum(a),
        'cover': getCover(htmlcode),
        'imagecut': 0,
        'tag': getTag(a),
        'label': getLabel(a),
        'year': getYear(getRelease(a)),  # str(re.search(r'\d{4}', getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
        'source': 'mgstage.py',
        'series': getSeries(a),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))  # .encode('UTF-8')
    return js
    # print(htmlcode)
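# Minimal consumption sketch (hypothetical caller): main() returns a JSON
# string, not a dict, so decode it before use:
#   import json
#   data = json.loads(main('SIRO-4149'))
#   print(data['title'], data['release'], data['tag'])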
if __name__ == '__main__':
    print(main('SIRO-4149'))
192
xcity.py
@@ -1,192 +0,0 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *


# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
    return result

def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
    return result1

def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d

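# Example (names illustrative): the function only builds placeholder keys,
# it never fetches photo URLs:
#   getActorPhoto('ActorA,ActorB')  ->  {'ActorA': '', 'ActorB': ''}
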
def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    except:
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d+', result1)[0]
    except:
        return ''

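# Example with hypothetical li text: if the node reads something like
# '収録時間: 90分', then re.findall(r'\d+', ...)[0] extracts '90';
# any page without a digit run falls through to ''.
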
def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
        return result
    except:
        return ''

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="hinban"]/text()')[0]
        return result
    except:
        return ''

def getYear(release):
    try:
        result = str(re.search(r'\d{4}', release).group())
        return result
    except:
        return release

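# Example: getYear('2019-05-01') -> '2019'; an input with no four-digit run
# (including '') is returned unchanged via the except branch.
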
def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except:
        return ''
    try:
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except:
        return ''

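# Example with hypothetical li text: for a node reading something like
# '発売日: 2019/05/01', re.findall(r'\d{4}/\d{2}/\d{2}', ...)[0]
# .replace('/', '-') normalizes it to '2019-05-01'.
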
def getTag(a):
    result2 = []
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
    for i in result1:
        i = i.replace(u'\n', '')
        i = i.replace(u'\t', '')
        result2.append(i)
    return result2

def getCover_small(a, index=0):
    # same issue as noted elsewhere:
    # javdb sometimes returns multiple results,
    # so DO NOT just take the first one; use the entry at the requested index
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    if 'https' not in result:
        result = 'https:' + result
    return result

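# Example: javdb serves protocol-relative thumbnail URLs, so an src like
# '//jdbimgs.com/thumbs/abc.jpg' (illustrative) becomes
# 'https://jdbimgs.com/thumbs/abc.jpg'.
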
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
        return 'https:' + result
    except:
        return ''

def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n', '').replace(u'\t', '')
        return result
    except:
        return ''

def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
    except:
        return ''
    try:
        return re.sub(r'\\\w*\d+', '', result)
    except:
        return result

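# The substitution appears intended to strip literal escape residue (a
# backslash followed by word characters ending in digits), e.g.:
#   re.sub(r'\\\w*\d+', '', 'intro\\u3000text')  ->  'introtext'
# Ordinary prose without backslashes passes through unchanged.
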
def getSeries(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        try:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
            return result
        except:
            result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
            return result
    except:
        return ''

def main(number):
    try:
        number = number.upper()
        query_result = get_html(
            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-', '') + '&sg=main&num=30')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
        detail_page = get_html('https://xcity.jp' + urls)
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': '',
            'imagecut': 1,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search(r'\d{4}', getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://xcity.jp' + urls,
            'source': 'xcity.py',
            'series': getSeries(detail_page),
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}

    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))  # .encode('UTF-8')
    return js

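# Usage sketch: the search URL drops the hyphen from the id, so
# main('VNDS-2624') queries q=VNDS2624, follows the first result row on the
# search page, and returns the detail JSON (a JSON object with an empty
# "title" on any failure).
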
if __name__ == '__main__':
    print(main('VNDS-2624'))