diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py
index a117059..8ee81c2 100755
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -86,7 +86,7 @@ def create_data_and_move(file_path: str, c: config.Config):
if __name__ == '__main__':
- version = '3.4.3'
+ version = '3.5'
# Parse command line args
single_file_path, config_file, auto_exit = argparse_function()
diff --git a/avsox.py b/avsox.py
index e54d8d1..c3d0b6a 100644
--- a/avsox.py
+++ b/avsox.py
@@ -72,6 +72,13 @@ def getTag(a): # 获取演员
for i in a:
d.append(i.get_text())
return d
+def getSeries(htmlcode):  # Return the series name from an avsox detail page, or '' when unavailable.
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
+        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
+        return result1
+    except Exception:  # best-effort field: unparsable markup means no series, but Ctrl-C/SystemExit still propagate
+        return ''
def main(number):
a = get_html('https://avsox.host/cn/search/' + number)
@@ -108,8 +115,10 @@ def main(number):
'actor_photo': getActorPhoto(web),
'website': result1,
'source': 'avsox.py',
+ 'series': getSeries(info),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
-#print(main('012717_472'))
\ No newline at end of file
+if __name__ == "__main__":
+ print(main('012717_472'))
\ No newline at end of file
diff --git a/core.py b/core.py
index b1a4725..bfb1a42 100755
--- a/core.py
+++ b/core.py
@@ -98,6 +98,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
runtime = json_data['runtime']
outline = json_data['outline']
label = json_data['label']
+    series = json_data.get('series', '')  # tolerate scrapers whose JSON has no 'series' key
year = json_data['year']
try:
cover_small = json_data['cover_small']
@@ -166,7 +167,8 @@ def get_info(json_data): # 返回json里的数据
number = json_data['number']
cover = json_data['cover']
website = json_data['website']
- return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website
+    series = json_data.get('series', '')  # tolerate scrapers whose JSON has no 'series' key
+    return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series
def small_cover_check(path, number, cover_small, c_word, conf: config.Config, filepath, failed_folder):
@@ -263,7 +265,7 @@ def image_download(cover, number, c_word, path, conf: config.Config, filepath, f
def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list, liuchu):
- title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data)
+ title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series = get_info(json_data)
try:
if not os.path.exists(path):
@@ -300,6 +302,7 @@ def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, fa
try:
for i in tag:
print(" " + i + "", file=code)
+ print(" " + series + "", file=code)
except:
aaaaa = ''
try:
diff --git a/fanza.py b/fanza.py
index fe4c7f4..d33abc7 100644
--- a/fanza.py
+++ b/fanza.py
@@ -108,7 +108,7 @@ def getRelease(text):
)[0].lstrip("\n")
except:
pass
- return result
+ return result.replace('/','-')
def getTag(text):
@@ -174,6 +174,23 @@ def getOutline(text):
return result
+def getSeries(text):
+    # Return the series name from a fanza detail page, or '' when absent.
+    try:
+        html = etree.fromstring(text, etree.HTMLParser())
+        cell = "//td[contains(text(),'シリーズ:')]/following-sibling::td"
+        # Prefer the linked series name; fall back to the cell's plain text.
+        for query in (cell + "/a/text()", cell + "/text()"):
+            found = html.xpath(query)
+            if found:
+                return found[0]
+        return ''
+    except Exception:
+        # Best-effort field: unparsable markup just means no series.
+        return ''
+
+
+
def main(number):
# fanza allow letter + number + underscore, normalize the input here
# @note: I only find the usage of underscore as h_test123456789
@@ -225,6 +242,7 @@ def main(number):
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
+ "series": getSeries(htmlcode),
}
except:
data = {
@@ -266,7 +284,4 @@ def main_htmlcode(number):
if __name__ == "__main__":
- # print(main("DV-1562"))
- # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
- # print(main("ipx292"))
- pass
+ print(main("DV-1562"))
\ No newline at end of file
diff --git a/fc2fans_club.py b/fc2fans_club.py
index d24cb3f..2cf5c1e 100755
--- a/fc2fans_club.py
+++ b/fc2fans_club.py
@@ -159,5 +159,5 @@ def main(number):
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8')
return js
-
-#print(main('1252953'))
+if __name__ == '__main__':
+ print(main('1252953'))
\ No newline at end of file
diff --git a/jav321.py b/jav321.py
index 9e0665c..7b0baae 100644
--- a/jav321.py
+++ b/jav321.py
@@ -13,7 +13,6 @@ def main(number: str) -> json:
data = parse_info(soup)
dic = {
"title": get_title(lx),
- "studio": "",
"year": get_year(data),
"outline": get_outline(lx),
"director": "",
@@ -46,10 +45,12 @@ def parse_info(soup: BeautifulSoup) -> dict:
return {
"actor": get_actor(data_dic),
"label": get_label(data_dic),
+ "studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic),
+ "series": get_series(data_dic),
}
else:
return {}
@@ -84,6 +85,9 @@ def get_cover(lx: html.HtmlElement) -> str:
def get_outline(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
+def get_series2(lx: html.HtmlElement) -> str:  # Text of the 11th <a> at a fixed page position; "" when it is missing.
+    return next(iter(lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")), "")
+
def get_actor(data: hash) -> str:
if "女优" in data:
@@ -106,6 +110,13 @@ def get_tag(data: hash) -> str:
return ""
+def get_studio(data: hash) -> str:
+    # Resolve the optional "片商" (studio) entry through get_anchor_info; "" when the key is absent.
+    if "片商" not in data:
+        return ""
+    return get_anchor_info(data["片商"])
+
+
def get_number(data: hash) -> str:
if "番号" in data:
return get_text_info(data["番号"])
@@ -134,5 +145,12 @@ def get_year(data: hash) -> str:
return ""
+def get_series(data: hash) -> str:
+    # Resolve the optional "系列" (series) entry through get_anchor_info; "" when the key is absent.
+    if "系列" not in data:
+        return ""
+    return get_anchor_info(data["系列"])
+
+
if __name__ == "__main__":
- print(main("wmc-002"))
+ print(main("soe-259"))
diff --git a/javbus.py b/javbus.py
index 9f77a25..7d51a4d 100755
--- a/javbus.py
+++ b/javbus.py
@@ -75,9 +75,12 @@ def getOutline(htmlcode): #获取演员
except:
return ''
def getSerise(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
- return result
+    try:
+        html = etree.fromstring(htmlcode, etree.HTMLParser())
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
+        return result
+    except Exception:  # best-effort field: unparsable markup means no series, but Ctrl-C/SystemExit still propagate
+        return ''
def getTag(htmlcode): # 获取演员
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
@@ -113,6 +116,7 @@ def main_uncensored(number):
'actor_photo': '',
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
+ 'series': getSerise(htmlcode),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -143,6 +147,7 @@ def main(number):
'actor_photo': getActorPhoto(htmlcode),
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
+ 'series': getSerise(htmlcode),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
separators=(',', ':'), ) # .encode('UTF-8')
@@ -157,3 +162,6 @@ def main(number):
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
+
+if __name__ == "__main__" :
+ print(main('ipx-292'))
diff --git a/javdb.py b/javdb.py
index 31a4e63..ce19601 100755
--- a/javdb.py
+++ b/javdb.py
@@ -13,8 +13,8 @@ def getTitle(a):
return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
@@ -25,23 +25,23 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser())
- result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
def getYear(getRelease):
try:
@@ -51,14 +51,18 @@ def getYear(getRelease):
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
- return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')
+    # xpath() returns [] instead of raising when nothing matches, so the
+    # fallback must test the result; a try/except never reaches it.
+    result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
+    if not result:
+        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
+    return result
+
def getCover_small(a, index=0):
# same issue mentioned below,
# javdb sometime returns multiple results
@@ -74,17 +78,26 @@ def getCover(htmlcode):
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
+ result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
+ result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
return result
+def getSeries(a):
+    # Series shown beside the "系列" label: span text first, then link text ('' when the row is absent).
+    tree = etree.fromstring(a, etree.HTMLParser())
+    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
+    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
+    return (plain + linked).strip('+').replace("', '", '').replace('"', '')
def main(number):
try:
number = number.upper()
- query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+ try:
+ query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
+ except:
+ query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# javdb sometime returns multiple results,
# and the first elememt maybe not the one we are looking for
@@ -111,6 +124,7 @@ def main(number):
'actor_photo': getActorPhoto(getActor(detail_page)),
'website': 'https://javdb.com' + correct_url,
'source': 'javdb.py',
+ 'series': getSeries(detail_page),
}
except Exception as e:
# print(e)
@@ -120,4 +134,5 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-#print(main('ipx-292'))
+if __name__ == "__main__":
+ print(main('ipx-292'))
diff --git a/mgstage.py b/mgstage.py
index 9685619..2c6391b 100755
--- a/mgstage.py
+++ b/mgstage.py
@@ -21,8 +21,8 @@ def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/t
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
- result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
- result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+ result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
+ result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -55,14 +55,14 @@ def getRelease(a):
'\\n')
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
- return str(result1 + result2).strip('+')
+ return str(result1 + result2).strip('+').replace('/','-')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
- return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
+ return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
@@ -79,6 +79,13 @@ def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result
+def getSeries(a):
+    # Series cell of the "シリーズ" row: link text first, then plain text, joined into one string.
+    tree = etree.fromstring(a, etree.HTMLParser())
+    queries = ('//th[contains(text(),"シリーズ")]/../td/a/text()', '//th[contains(text(),"シリーズ")]/../td/text()')
+    # The list repr is trimmed of brackets/quotes, then of the escaped "\n" padding it contains.
+    parts = [str(tree.xpath(q)).strip(" ['']").strip('\\n ').strip('\\n') for q in queries]
+    return ''.join(parts).strip('+').replace("', '", '').replace('"', '')
def main(number2):
number=number2.upper()
htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
@@ -103,6 +110,7 @@ def main(number2):
'actor_photo': '',
'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
'source': 'mgstage.py',
+ 'series': getSeries(a),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
diff --git a/xcity.py b/xcity.py
index 3ea001d..0a12ad8 100644
--- a/xcity.py
+++ b/xcity.py
@@ -70,7 +70,7 @@ def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')).strip(" ['']")
try:
- return re.findall('\d{4}/\d{2}/\d{2}', result1)[0]
+        return re.findall(r'\d{4}/\d{2}/\d{2}', result1)[0].replace('/','-')  # raw string: '\d' is an invalid str escape
except:
return ''
@@ -122,8 +122,7 @@ def main(number):
try:
number = number.upper()
query_result = get_html(
- 'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-',
- '') + '&sg=main&num=30')
+ 'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
detail_page = get_html('https://xcity.jp' + urls)