Update 3.5

Author: Yoshiko2
Date: 2020-06-21 23:53:08 +08:00 (committed by GitHub)
Parent: b016113fc1
Commit: 2c6169b340
10 changed files with 117 additions and 42 deletions

View File

@@ -86,7 +86,7 @@ def create_data_and_move(file_path: str, c: config.Config):
 if __name__ == '__main__':
-    version = '3.4.3'
+    version = '3.5'
     # Parse command line args
     single_file_path, config_file, auto_exit = argparse_function()

View File

@@ -72,6 +72,13 @@ def getTag(a):  # get actor
     for i in a:
         d.append(i.get_text())
     return d
+def getSeries(htmlcode):
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
+        return result1
+    except:
+        return ''
 
 def main(number):
     a = get_html('https://avsox.host/cn/search/' + number)
@@ -108,8 +115,10 @@ def main(number):
         'actor_photo': getActorPhoto(web),
         'website': result1,
         'source': 'avsox.py',
+        'series': getSeries(info),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
     return js
 
-#print(main('012717_472'))
+if __name__ == "__main__":
+    print(main('012717_472'))
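
For reference, a minimal standalone sketch (not part of the commit) of what the new avsox getSeries() xpath extracts, run against a hypothetical fragment of a detail page; the markup and series name below are made up:

# Sketch only: exercises the same xpath used by getSeries() on a hypothetical fragment.
from lxml import etree

sample = '<p><span class="header">系列:</span> <span>Hypothetical Series</span></p>'
html = etree.fromstring(sample, etree.HTMLParser())
# Find the "系列:" label span, step up to its parent <p>, take the second <span>'s text.
series = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
print(series)  # -> Hypothetical Series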

View File

@@ -98,6 +98,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # from JSON
     runtime = json_data['runtime']
     outline = json_data['outline']
     label = json_data['label']
+    series = json_data['series']
     year = json_data['year']
     try:
         cover_small = json_data['cover_small']
@@ -166,7 +167,8 @@ def get_info(json_data):  # return the data in the json
     number = json_data['number']
     cover = json_data['cover']
     website = json_data['website']
-    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website
+    series = json_data['series']
+    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series
 
 def small_cover_check(path, number, cover_small, c_word, conf: config.Config, filepath, failed_folder):
@@ -263,7 +265,7 @@ def image_download(cover, number, c_word, path, conf: config.Config, filepath, f
 def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list, liuchu):
-    title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data)
+    title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series = get_info(json_data)
     try:
         if not os.path.exists(path):
@@ -300,6 +302,7 @@ def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, fa
         try:
             for i in tag:
                 print(" <tag>" + i + "</tag>", file=code)
+            print(" <tag>" + series + "</tag>", file=code)
         except:
             aaaaa = ''
         try:
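
A rough sketch (values made up, io.StringIO standing in for the NFO file handle) of what the extra print call in print_files() changes: the series name is appended as one more <tag> element after the scraped tags.

# Illustration only: mimics the tag-writing loop with hypothetical data.
import io

tag = ['Solowork', 'Debut']      # hypothetical json_data['tag']
series = 'Hypothetical Series'   # hypothetical json_data['series']

code = io.StringIO()
for i in tag:
    print(" <tag>" + i + "</tag>", file=code)
print(" <tag>" + series + "</tag>", file=code)  # the line this commit adds
print(code.getvalue(), end='')
#  <tag>Solowork</tag>
#  <tag>Debut</tag>
#  <tag>Hypothetical Series</tag>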

View File

@@ -108,7 +108,7 @@ def getRelease(text):
         )[0].lstrip("\n")
     except:
         pass
-    return result
+    return result.replace('/','-')
 
 def getTag(text):
@@ -174,6 +174,23 @@ def getOutline(text):
     return result
 
+def getSeries(text):
+    try:
+        html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        try:
+            result = html.xpath(
+                "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
+            )[0]
+        except:
+            result = html.xpath(
+                "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
+            )[0]
+        return result
+    except:
+        return ''
 
 def main(number):
     # fanza allow letter + number + underscore, normalize the input here
     # @note: I only find the usage of underscore as h_test123456789
@@ -225,6 +242,7 @@ def main(number):
             "actor_photo": "",
             "website": chosen_url,
             "source": "fanza.py",
+            "series": getSeries(htmlcode),
         }
     except:
         data = {
@@ -266,7 +284,4 @@ def main_htmlcode(number):
 
 if __name__ == "__main__":
-    # print(main("DV-1562"))
-    # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-    # print(main("ipx292"))
-    pass
+    print(main("DV-1562"))
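
A small sketch of why the new fanza getSeries() tries two xpath branches: on the detail page the シリーズ row may hold either a link or bare text (the simplified table markup below is an assumption, not FANZA's real HTML):

# Sketch only: the /a/text() branch is tried first, then the plain-text fallback.
from lxml import etree

linked = "<table><tr><td>シリーズ:</td><td><a href='#'>Some Series</a></td></tr></table>"
plain = "<table><tr><td>シリーズ:</td><td>Some Series</td></tr></table>"

for markup in (linked, plain):
    html = etree.fromstring(markup, etree.HTMLParser())
    try:
        result = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()")[0]
    except IndexError:  # no <a> inside the cell, fall back to its text
        result = html.xpath("//td[contains(text(),'シリーズ:')]/following-sibling::td/text()")[0]
    print(result)  # Some Series (in both cases)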

View File

@@ -159,5 +159,5 @@ def main(number):
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)  # .encode('UTF-8')
     return js
 
-#print(main('1252953'))
+if __name__ == '__main__':
+    print(main('1252953'))
View File

@@ -13,7 +13,6 @@ def main(number: str) -> json:
     data = parse_info(soup)
     dic = {
         "title": get_title(lx),
-        "studio": "",
         "year": get_year(data),
         "outline": get_outline(lx),
         "director": "",
@@ -46,10 +45,12 @@ def parse_info(soup: BeautifulSoup) -> dict:
         return {
             "actor": get_actor(data_dic),
             "label": get_label(data_dic),
+            "studio": get_studio(data_dic),
             "tag": get_tag(data_dic),
             "number": get_number(data_dic),
             "release": get_release(data_dic),
             "runtime": get_runtime(data_dic),
+            "series": get_series(data_dic),
         }
     else:
         return {}
@@ -84,6 +85,9 @@ def get_cover(lx: html.HtmlElement) -> str:
 def get_outline(lx: html.HtmlElement) -> str:
     return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
 
+def get_series2(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
 
 def get_actor(data: hash) -> str:
     if "女优" in data:
@@ -106,6 +110,13 @@ def get_tag(data: hash) -> str:
         return ""
 
+def get_studio(data: hash) -> str:
+    if "片商" in data:
+        return get_anchor_info(data["片商"])
+    else:
+        return ""
 
 def get_number(data: hash) -> str:
     if "番号" in data:
         return get_text_info(data["番号"])
@@ -134,5 +145,12 @@ def get_year(data: hash) -> str:
         return ""
 
+def get_series(data: hash) -> str:
+    if "系列" in data:
+        return get_anchor_info(data["系列"])
+    else:
+        return ""
 
 if __name__ == "__main__":
-    print(main("wmc-002"))
+    print(main("soe-259"))

View File

@@ -75,9 +75,12 @@ def getOutline(htmlcode):  # get actor
     except:
         return ''
 
 def getSerise(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    return result
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
+        return result
+    except:
+        return ''
 
 def getTag(htmlcode):  # get actor
     tag = []
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -113,6 +116,7 @@ def main_uncensored(number):
         'actor_photo': '',
         'website': 'https://www.javbus.com/' + number,
         'source': 'javbus.py',
+        'series': getSerise(htmlcode),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
     return js
@@ -143,6 +147,7 @@ def main(number):
         'actor_photo': getActorPhoto(htmlcode),
         'website': 'https://www.javbus.com/' + number,
         'source': 'javbus.py',
+        'series': getSerise(htmlcode),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                     separators=(',', ':'), )  # .encode('UTF-8')
@@ -157,3 +162,6 @@ def main(number):
         data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
     )
     return js
+
+if __name__ == "__main__":
+    print(main('ipx-292'))

View File

@@ -13,8 +13,8 @@ def getTitle(a):
     return result
 
 def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
 
 def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
     a = actor.split(',')
@@ -25,23 +25,23 @@ def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
     return d
 
 def getStudio(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def getRuntime(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').rstrip('mi')
 
 def getLabel(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def getNum(a):
     html = etree.fromstring(a, etree.HTMLParser())
-    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
     return str(result2 + result1).strip('+')
 
 def getYear(getRelease):
     try:
@@ -51,14 +51,18 @@ def getYear(getRelease):
     return getRelease
 
 def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+')
 
 def getTag(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
-    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')
+    try:
+        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
+        return result
+    except:
+        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
+        return result
 
 def getCover_small(a, index=0):
     # same issue mentioned below,
     # javdb sometime returns multiple results
@@ -74,17 +78,26 @@ def getCover(htmlcode):
     return result
 
 def getDirector(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
+    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def getOutline(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
     return result
 
+def getSeries(a):
+    # /html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
+    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
+    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def main(number):
     try:
         number = number.upper()
-        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+        try:
+            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+        except:
+            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
         html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
         # javdb sometime returns multiple results,
         # and the first elememt maybe not the one we are looking for
@@ -111,6 +124,7 @@ def main(number):
             'actor_photo': getActorPhoto(getActor(detail_page)),
             'website': 'https://javdb.com' + correct_url,
             'source': 'javdb.py',
+            'series': getSeries(detail_page),
         }
     except Exception as e:
         # print(e)
@@ -120,4 +134,5 @@ def main(number):
 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-#print(main('ipx-292'))
+if __name__ == "__main__":
+    print(main('ipx-292'))
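
The search request now retries on the javdb4.com mirror when javdb.com fails. A standalone sketch of that fallback shape, written against plain requests rather than the project's get_html() helper (the error handling here is an assumption):

# Sketch only: primary domain first, mirror on failure.
import requests

def search_html(number: str) -> str:
    try:
        r = requests.get('https://javdb.com/search', params={'q': number, 'f': 'all'}, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # javdb.com unreachable or blocked: retry on the javdb4.com mirror
        r = requests.get('https://javdb4.com/search', params={'q': number, 'f': 'all'}, timeout=10)
        r.raise_for_status()
    return r.text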

View File

@@ -21,8 +21,8 @@ def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/t
     return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
 
 def getStudio(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1=str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
-    result2=str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    result1=str(html.xpath('//th[contains(text(),"メーカー")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+    result2=str(html.xpath('//th[contains(text(),"メーカー")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1+result2).strip('+').replace("', '",'').replace('"','')
 
 def getRuntime(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
@@ -55,14 +55,14 @@ def getRelease(a):
         '\\n')
     result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
         '\\n')
-    return str(result1 + result2).strip('+')
+    return str(result1 + result2).strip('+').replace('/','-')
 
 def getTag(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
         '\\n')
     result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
         '\\n')
-    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
+    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
 
 def getCover(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
@@ -79,6 +79,13 @@ def getOutline(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
     return result
 
+def getSeries(a):
+    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
+        '\\n')
+    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def main(number2):
     number=number2.upper()
     htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
@@ -103,6 +110,7 @@ def main(number2):
         'actor_photo': '',
         'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
         'source': 'mgstage.py',
+        'series': getSeries(a),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
     return js
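
Note that getTag() now ends with .split(','), so it returns a list of tag strings instead of one comma-joined string, which matches how core.py iterates over tag. A tiny illustration with made-up values:

# Illustration only: effect of the trailing .split(',') on getTag()'s return value.
joined = 'Tag A,Tag B,Tag C'   # hypothetical cleaned-up xpath result
print(joined)                  # 'Tag A,Tag B,Tag C'  (old: one string)
print(joined.split(','))       # ['Tag A', 'Tag B', 'Tag C']  (new: one entry per tag)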

View File

@@ -70,7 +70,7 @@ def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')).strip(" ['']")
     try:
-        return re.findall('\d{4}/\d{2}/\d{2}', result1)[0]
+        return re.findall('\d{4}/\d{2}/\d{2}', result1)[0].replace('/','-')
     except:
         return ''
@@ -122,8 +122,7 @@ def main(number):
     try:
         number = number.upper()
         query_result = get_html(
-            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-',
-            '') + '&sg=main&num=30')
+            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
         html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
         urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
         detail_page = get_html('https://xcity.jp' + urls)
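
Several scrapers in this commit (fanza, mgstage, xcity) normalize release dates from YYYY/MM/DD to YYYY-MM-DD. A one-line sketch of the xcity variant, with a made-up scraped string:

# Sketch of the new normalization in getRelease(): slash dates become dash dates.
import re

result1 = 'Release date: 2020/06/21'  # hypothetical scraped text
print(re.findall(r'\d{4}/\d{2}/\d{2}', result1)[0].replace('/', '-'))  # 2020-06-21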