diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
deleted file mode 100644
index d25b8ad..0000000
--- a/WebCrawler/airav.py
+++ /dev/null
@@ -1,227 +0,0 @@
-import sys
-sys.path.append('../')
-from bs4 import BeautifulSoup  # needs install
-from ADC_function import *
-from WebCrawler import javbus
-
-'''
-API
-Sign-up:  https://www.airav.wiki/api/auth/signup
-Settings: https://www.airav.wiki/api/get_web_settings
-Search:   https://www.airav.wiki/api/video/list?lng=zh-CN&search=
-Search:   https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
-'''
-host = 'https://www.airav.wiki'
-
-# airav has no actor photos, so use the javbus ones directly
-def getActorPhoto(javbus_json):
- result = javbus_json.get('actor_photo')
- if isinstance(result, dict) and len(result):
- return result
- return ''
-
-def getTitle(htmlcode):  # get the title
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- title = str(html.xpath('/html/head/title/text()')[0])
- result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
- return result
-
-def getStudio(htmlcode, javbus_json):  # get the studio (revised)
-    # prefer javbus data when it is available
- result = javbus_json.get('studio')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode,etree.HTMLParser())
- return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
-def getYear(htmlcode, javbus_json):  # get the year
- result = javbus_json.get('year')
- if isinstance(result, str) and len(result):
- return result
- release = getRelease(htmlcode, javbus_json)
- if len(release) != len('2000-01-01'):
- return ''
- return release[:4]
-def getCover(htmlcode, javbus_json):  # get the cover image
- result = javbus_json.get('cover')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
-def getRelease(htmlcode, javbus_json):  # get the release date
- result = javbus_json.get('release')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- try:
- result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
- except:
- return ''
- return result
-def getRuntime(javbus_json):  # get the runtime
- result = javbus_json.get('runtime')
- if isinstance(result, str) and len(result):
- return result
- return ''
-# airav's actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
-def getActor(htmlcode, javbus_json):  # get the actresses
- b=[]
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
- for v in a:
- v = v.strip()
- if len(v):
- b.append(v)
- if len(b):
- return b
- result = javbus_json.get('actor')
- if isinstance(result, list) and len(result):
- return result
- return []
-def getNum(htmlcode, javbus_json):  # get the movie number
- result = javbus_json.get('number')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- title = str(html.xpath('/html/head/title/text()')[0])
-    result = str(re.findall(r'^\[(.*?)]', title)[0])
- return result
-def getDirector(javbus_json):  # get the director (revised)
- result = javbus_json.get('director')
- if isinstance(result, str) and len(result):
- return result
- return ''
-def getOutline(htmlcode):  # get the outline
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- try:
- result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
- return result
- except:
- return ''
-def getSerise(javbus_json):  # get the series (revised)
- result = javbus_json.get('series')
- if isinstance(result, str) and len(result):
- return result
- return ''
-def getTag(htmlcode):  # get the tags
- tag = []
- soup = BeautifulSoup(htmlcode, 'lxml')
- x = soup.find_all(attrs={'class': 'tagBtnMargin'})
- a = x[0].find_all('a')
-
- for i in a:
- tag.append(i.get_text())
- return tag
-
-def getExtrafanart(htmlcode):  # get the stills (extrafanart)
-    html_pather = re.compile(r'<div class="mobileImgThumbnail">[\s\S]*?</div></div></div></div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src="(.*?)"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
-def search(keyword):
-    result = []
-    page = 1
-    while page > 0:
- # search_result = {"offset": 0,"count": 4,"result": [
- # {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
- # "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
- # {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
- # "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
- # {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
- # "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
- # {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
- # "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
- # ],"status": "ok"}
- search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))
-
- try:
- json_data = json.loads(search_result)
- except json.decoder.JSONDecodeError:
- # print("[-]Json decoder error!")
- return []
-
- result_offset = int(json_data["offset"])
- result_count = int(json_data["count"])
- result_size = len(json_data["result"])
- if result_count <= 0 or result_size <= 0:
- return result
-    elif result_count > result_offset + result_size:  # request the next page
- result.extend(json_data["result"])
- page += 1
-    elif result_count == result_offset + result_size:  # request the final page
- result.extend(json_data["result"])
- page = 0
- else:
- page = 0
-
- return result
-
-def main(number):
- try:
- try:
- htmlcode = get_html('https://cn.airav.wiki/video/' + number)
- javbus_json = json.loads(javbus.main(number))
-
- except:
- # print(number)
-            pass  # htmlcode/javbus_json may be undefined here; the outer except returns an empty result
-
-        dic = {
-            # title: airav's own title works directly
-            'title': getTitle(htmlcode),
-            # studio: prefer javbus; fall back to this site
-            'studio': getStudio(htmlcode, javbus_json),
-            # year: prefer javbus; fall back to this site
-            'year': getYear(htmlcode, javbus_json),
-            # outline: from airav
-            'outline': getOutline(htmlcode),
-            # runtime: from javbus
-            'runtime': getRuntime(javbus_json),
-            # director: from javbus
-            'director': getDirector(javbus_json),
-            # actors: prefer airav
-            'actor': getActor(htmlcode, javbus_json),
-            # release date: prefer javbus
-            'release': getRelease(htmlcode, javbus_json),
-            # number: from javbus
-            'number': getNum(htmlcode, javbus_json),
-            # cover link: from javbus
-            'cover': getCover(htmlcode, javbus_json),
-            # stills
-            'extrafanart': getExtrafanart(htmlcode),
-            'imagecut': 1,
-            # tags: from airav
-            'tag': getTag(htmlcode),
-            # label: from javbus
-            'label': getSerise(javbus_json),
-            'actor_photo': getActorPhoto(javbus_json),
-            'website': 'https://www.airav.wiki/video/' + number,
-            'source': 'airav.py',
-            # series: from javbus
-            'series': getSerise(javbus_json)
-        }
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
- return js
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- data = {
- "title": "",
- }
- js = json.dumps(
- data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
- )
- return js
-
-
-if __name__ == '__main__':
- config.getInstance().set_override("actor_photo:download_for_kodi=1")
- config.getInstance().set_override("debug_mode:switch=1")
-    print(main('ADV-R0624'))  # javbus page returns 404, airav has data
-    print(main('ADN-188'))  # single actress
-    print(main('CJOD-278'))  # multiple actresses; javbus names use kana, airav uses kanji
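Note on the removed airav module: the /api/video/list endpoint documented in its header comment drives search() through offset/count pagination. A minimal standalone sketch of that loop, assuming only the requests library (airav_search is an illustrative name, not from this codebase):

    import requests

    def airav_search(keyword: str) -> list:
        """Collect every result page from airav's list API."""
        results, page = [], 1
        while page > 0:
            resp = requests.get(
                'https://www.airav.wiki/api/video/list',
                params={'lang': 'zh-TW', 'lng': 'jp', 'search': keyword, 'page': page},
            )
            data = resp.json()
            size = len(data.get('result', []))
            if int(data.get('count', 0)) <= 0 or size == 0:
                break
            results.extend(data['result'])
            # stop once offset plus this page's size reaches the reported total
            if int(data['count']) <= int(data['offset']) + size:
                break
            page += 1
        return results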
diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
deleted file mode 100644
index a18eab6..0000000
--- a/WebCrawler/avsox.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import sys
-sys.path.append('..')
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-from WebCrawler.crawler import *
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-
-def getActorPhoto(html):
- a = html.xpath('//a[@class="avatar-box"]')
- d = {}
- for i in a:
- l = i.find('.//img').attrib['src']
- t = i.find('span').text
- p2 = {t: l}
- d.update(p2)
- return d
-
-def getActor(html):
- a = html.xpath('//a[@class="avatar-box"]')
- d = []
- for i in a:
- d.append(i.find('span').text)
- return d
-
-def getCover_small(html):
- result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
- return result
-def getTag(html):
- x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
- return [i.strip() for i in x[2:]] if len(x) > 2 else []
-
-def main(number):
- html = get_html('https://tellme.pw/avsox')
- site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
- a = get_html(site + '/cn/search/' + number)
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
- if result1 == '' or result1 == 'null' or result1 == 'None':
- a = get_html(site + '/cn/search/' + number.replace('-', '_'))
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
- if result1 == '' or result1 == 'null' or result1 == 'None':
- a = get_html(site + '/cn/search/' + number.replace('_', ''))
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
- detail = get_html("https:" + result1)
- lx = etree.fromstring(detail, etree.HTMLParser())
- avsox_crawler2 = Crawler(a)
- avsox_crawler = Crawler(detail)
- try:
- new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
- if new_number.upper() != number.upper():
- raise ValueError('number not found')
- title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
- dic = {
- 'actor': getActor(lx),
- 'title': title,
- 'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
- 'outline': getStoryline(number, title),
- 'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
- 'director': '', #
- 'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
- 'number': new_number,
- 'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
- #'cover_small' : getCover_small(html),
- 'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
- 'imagecut': 3,
- 'tag': getTag(lx),
- 'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
-            'year': re.findall(r'\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
- 'actor_photo': getActorPhoto(lx),
- 'website': "https:" + result1,
- 'source': 'avsox.py',
- 'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
- }
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- dic = {"title": ""}
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
- return js
-
-if __name__ == "__main__":
- print(main('012717_472'))
-    print(main('1')) # fake result; triggers the 'number not found' ValueError
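Note on the removed avsox module: the site rotates domains, so main() first resolved the live mirror from the tellme.pw redirector, then retried the search with '-' and '_' variants of the number. A hedged sketch of just the mirror lookup, using requests and lxml directly (resolve_avsox_site is an illustrative name):

    import requests
    from lxml import etree

    def resolve_avsox_site() -> str:
        """Return the current avsox mirror URL published on tellme.pw."""
        page = requests.get('https://tellme.pw/avsox').text
        tree = etree.HTML(page)
        # the same XPath the removed module used for the mirror link
        return tree.xpath('//div[@class="container"]/div/a/@href')[0]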
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
deleted file mode 100755
index 50cbcc1..0000000
--- a/WebCrawler/carib.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import sys
-sys.path.append('../')
-from lxml import html
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-
-
-G_SITE = 'https://www.caribbeancom.com'
-
-
-def main(number: str) -> str:
- try:
- url = f'{G_SITE}/moviepages/{number}/index.html'
-        result, session = get_html_session(url, return_type='session')
-        if not result:  # check before decoding, or a failed fetch raises AttributeError here
-            raise ValueError("page not found")
-        htmlcode = result.content.decode('euc-jp')
-        if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode:
- raise ValueError("page not found")
-
- lx = html.fromstring(htmlcode)
- title = get_title(lx)
-
- dic = {
- 'title': title,
-            'studio': '加勒比',  # "Caribbean"
- 'year': get_year(lx),
- 'outline': get_outline(lx, number, title),
- 'runtime': get_runtime(lx),
- 'director': '',
- 'actor': get_actor(lx),
- 'release': get_release(lx),
- 'number': number,
- 'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
- 'tag': get_tag(lx),
- 'extrafanart': get_extrafanart(lx),
- 'label': get_series(lx),
- 'imagecut': 1,
- 'website': f'{G_SITE}/moviepages/{number}/index.html',
- 'source': 'carib.py',
- 'series': get_series(lx),
-            '无码': True  # uncensored flag
- }
- if config.getInstance().download_actor_photo_for_kodi():
- dic['actor_photo'] = get_actor_photo(lx, session)
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
- return js
-
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- dic = {"title": ""}
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
-
-def get_title(lx: html.HtmlElement) -> str:
- return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
-
-def get_year(lx: html.HtmlElement) -> str:
- return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-
-def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
- o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
- g = getStoryline(number, title, 无码=True)
- if len(g):
- return g
- return o
-
-def get_release(lx: html.HtmlElement) -> str:
- return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
-
-def get_actor(lx: html.HtmlElement):
- r = []
- actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
- for act in actors:
-        if str(act) != '他':  # skip the placeholder "others" entry
- r.append(act)
- return r
-
-def get_tag(lx: html.HtmlElement) -> list:
- genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
- return genres
-
-def get_extrafanart(lx: html.HtmlElement) -> list:
- r = []
- genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
- for g in genres:
- jpg = str(g)
- if '/member/' in jpg:
- break
- else:
- r.append('https://www.caribbeancom.com' + jpg)
- return r
-
-def get_series(lx: html.HtmlElement) -> str:
- try:
- return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
- except:
- return ''
-
-def get_runtime(lx: html.HtmlElement) -> str:
- return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
-
-def get_actor_photo(lx, session):
- htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
- names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
- t = {}
- for name, a in zip(names, htmla):
- if name.strip() == '他':
- continue
- p = {name.strip(): a.attrib['href']}
- t.update(p)
- o = {}
- for k, v in t.items():
- if '/search_act/' not in v:
- continue
- r = session.get(urljoin(G_SITE, v))
- if not r.ok:
- continue
- html = r.text
- pos = html.find('.full-bg')
- if pos<0:
- continue
- css = html[pos:pos+100]
- cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
- if not cssBGjpgs or not len(cssBGjpgs[0]):
- continue
- p = {k: urljoin(r.url, cssBGjpgs[0])}
- o.update(p)
- return o
-
-if __name__ == "__main__":
- print(main("070116-197")) # actor have photo
- print(main("041721-001"))
- print(main("080520-001"))
diff --git a/WebCrawler/crawler.py b/WebCrawler/crawler.py
deleted file mode 100644
index e6176b6..0000000
--- a/WebCrawler/crawler.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from lxml import etree
-
-class Crawler:
- def __init__(self,htmlcode):
- self.html = etree.HTML(htmlcode)
-
- def getString(self,_xpath):
- if _xpath == "":
- return ""
- result = self.html.xpath(_xpath)
- try:
- return result[0]
- except:
- return ""
-
- def getStrings(self,_xpath):
- result = self.html.xpath(_xpath)
- try:
- return result
- except:
- return ""
-
- def getOutline(self,_xpath):
- result = self.html.xpath(_xpath)
- try:
- return "\n".join(result)
- except:
- return ""
\ No newline at end of file
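Note on the removed Crawler class: it was the thin query layer avsox.py and fanza.py built on; parse once, query many times, and return empty strings instead of raising on missing nodes. A short usage sketch against the class as written (the sample HTML is made up):

    page = '<html><body><h3>Example Title</h3><p>line one</p><p>line two</p></body></html>'
    c = Crawler(page)
    c.getString('//h3/text()')   # 'Example Title'
    c.getString('//h2/text()')   # '' -- missing nodes come back empty, not as exceptions
    c.getOutline('//p/text()')   # 'line one\nline two'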
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
deleted file mode 100644
index 54ed6f7..0000000
--- a/WebCrawler/dlsite.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import re
-from lxml import etree
-import json
-import sys
-sys.path.append('../')
-from ADC_function import *
-
-def getTitle(html):
- result = str(html.xpath('/html/head/title/text()')[0])
- result = result[:result.rfind(' | DLsite')]
- result = result[:result.rfind(' [')]
- result = result.replace('【HD版】', '')
- return result
-def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
- try:
- result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
- except:
- result1 = ''
- return result1
-def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
- a = actor.split(',')
- d={}
- for i in a:
- p={i:''}
- d.update(p)
- return d
-def getStudio(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
- return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-def getYear(getRelease):
- try:
-        result = str(re.search(r'\d{4}', getRelease).group())
- return result
- except:
- return getRelease
-def getRelease(html):
- result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
- return result1.replace('年','-').replace('月','-').replace('日','')
-def getTag(html):
- try:
- result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
- return result
- except:
- return ''
-
-def getCover_small(a, index=0):
-    # same issue mentioned below:
-    # javdb sometimes returns multiple results,
-    # DO NOT just get the first one; get the one with the correct index number
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- try:
- result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
- if not 'https' in result:
- result = 'https:' + result
- return result
-    except: # 2020.7.17: fix cover URL crawling
- result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
- if not 'https' in result:
- result = 'https:' + result
- return result
-def getCover(html):
- result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
- return result.replace('.webp', '.jpg')
-def getDirector(html):
- try:
- result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
- except:
- result = ''
- return result
-def getOutline(html):
- total = []
- result = html.xpath('//*[@class="work_parts_area"]/p/text()')
- for i in result:
- total.append(i.strip('\r\n'))
- return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
-def getSeries(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-#
-def getExtrafanart(html):
- try:
- result = []
- for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'):
- result.append("https:" + i)
- except:
- result = ''
- return result
-def main(number):
- try:
- if "RJ" in number or "VJ" in number:
- number = number.upper()
- htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'})
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- else:
- htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})
- html = etree.HTML(htmlcode)
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- if "~" in number:
- number = number.replace("~","〜")
- elif "〜" in number:
- number = number.replace("〜","~")
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- a = search_result[0]
- html = etree.HTML(get_html(a,cookies={'locale': 'zh-cn'}))
-        number = str(re.findall(r"\wJ\w+", a)).strip(" [']")
- dic = {
-        'actor': getStudio(html),  # dlsite credits a circle/maker rather than performers; reused for actor
- 'title': getTitle(html),
- 'studio': getStudio(html),
- 'outline': getOutline(html),
- 'runtime': '',
- 'director': getDirector(html),
- 'release': getRelease(html),
- 'number': number,
- 'cover': 'https:' + getCover(html),
- 'cover_small': '',
- 'imagecut': 4,
- 'tag': getTag(html),
- 'label': getLabel(html),
- 'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
- 'actor_photo': '',
- 'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
- 'source': 'dlsite.py',
- 'series': getSeries(html),
- 'extrafanart':getExtrafanart(html),
- 'allow_number_change':True,
- }
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
- return js
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- data = {
- "title": "",
- }
- js = json.dumps(
- data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
- )
- return js
-
-# main('DV-1562')
-# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-if __name__ == "__main__":
- config.getInstance().set_override("debug_mode:switch=1")
- print(main('牝教師4~穢された教壇~ 「生意気ドジっ娘女教師・美結~高飛車ハメ堕ち2濁金」'))
- print(main('RJ329607'))
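Note on the removed dlsite module: both request paths pin the storefront language with a locale cookie; without it, the label-based XPath lookups ("贩卖日", "分类", ...) match nothing. A minimal fetch sketch assuming the requests library (fetch_dlsite is an illustrative name):

    import requests

    def fetch_dlsite(product_id: str) -> str:
        """Fetch a dlsite work page in the zh-cn locale the parsers expect."""
        url = f'https://www.dlsite.com/maniax/work/=/product_id/{product_id}.html/?locale=zh_CN'
        return requests.get(url, cookies={'locale': 'zh-cn'}).text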
diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py
deleted file mode 100644
index 38a919a..0000000
--- a/WebCrawler/fanza.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import sys
-sys.path.append('../')
-from urllib.parse import urlencode
-
-from ADC_function import *
-from WebCrawler.crawler import *
-
-class fanzaCrawler(Crawler):
- def getFanzaString(self,string):
- result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
- result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
- return result1+result2
-
- def getFanzaStrings(self, string):
- result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
- if len(result1) > 0:
- return result1
- result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
- return result2
-
-
-def getRelease(fanza_Crawler):
- result = fanza_Crawler.getFanzaString('発売日:')
- if result == '' or result == '----':
- result = fanza_Crawler.getFanzaString('配信開始日:')
- return result.replace("/", "-").strip('\\n')
-
-
-def getCover(html, number):
- cover_number = number
- try:
- result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
- except:
-        # sometimes fanza changes _ to \u005f in the image id
- if "_" in cover_number:
- cover_number = cover_number.replace("_", r"\u005f")
- try:
- result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
- except:
- # (TODO) handle more edge case
- # print(html)
- # raise exception here, same behavior as before
- # people's major requirement is fetching the picture
- raise ValueError("can not find image")
- return result
-
-
-def getOutline(html):
- try:
- result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
- if result == "":
- result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
- except:
- # (TODO) handle more edge case
- # print(html)
- return ""
- return result
-
-
-def getExtrafanart(htmlcode):  # get the stills (extrafanart)
-    html_pather = re.compile(r'<div id="sample-image-block"[\s\S]*?</div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src="(.*?)"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
diff --git a/WebCrawler/jav321.py b/WebCrawler/jav321.py
deleted file mode 100644
--- a/WebCrawler/jav321.py
+++ /dev/null
-import sys
-sys.path.append('../')
-from bs4 import BeautifulSoup
-from lxml import html
-from ADC_function import *
-
-def main(number: str) -> str:
- try:
- result = post_html(url="https://www.jav321.com/search", query={"sn": number})
- soup = BeautifulSoup(result.text, "html.parser")
- lx = html.fromstring(str(soup))
- except:
- dic = {"title": ""}
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
- if "/video/" in result.url:
- data = parse_info(soup)
-
- dic = {
- "title": get_title(lx),
- "year": get_year(data),
- "outline": get_outline(lx),
- "director": "",
- "cover": get_cover(lx),
- "imagecut": 1,
- "trailer": get_trailer(result.text),
- "extrafanart": get_extrafanart(result.text),
- "actor_photo": "",
- "website": result.url,
- "source": "jav321.py",
- **data,
- }
- else:
- dic = {"title": ""}
-
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
-def get_title(lx: html.HtmlElement) -> str:
- return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
-
-
-def parse_info(soup: BeautifulSoup) -> dict:
- data = soup.select_one("div.row > div.col-md-9")
-
- if data:
- dd = str(data).split("
")
- data_dic = {}
- for d in dd:
- data_dic[get_bold_text(h=d)] = d
-
- return {
- "actor": get_actor(data_dic),
- "label": get_label(data_dic),
- "studio": get_studio(data_dic),
- "tag": get_tag(data_dic),
- "number": get_number(data_dic).upper(),
- "release": get_release(data_dic),
- "runtime": get_runtime(data_dic).replace(" minutes", ""),
- "series": get_series(data_dic),
- }
- else:
- return {"title": ""}
-
-
-def get_bold_text(h: str) -> str:
- soup = BeautifulSoup(h, "html.parser")
- if soup.b:
- return soup.b.text
- else:
- return "UNKNOWN_TAG"
-
-
-def get_anchor_info(h: str) -> str:
- result = []
-
- data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
- for d in data:
- result.append(d.text)
-
- return ",".join(result)
-
-
-def get_text_info(h: str) -> str:
- return h.split(": ")[1]
-
-def get_trailer(html) -> str:
-    videourl_pather = re.compile(r'<video[\s\S]*?<source src="(.*?)"')
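Note on the removed jav321 module: parse_info() splits the detail <div> on <br/> and uses each fragment's bold label as the lookup key, which is what get_bold_text() and the two extractors above serve. A tiny illustration of that dispatch (the sample fragment is made up):

    from bs4 import BeautifulSoup

    fragment = '<b>メーカー</b>: <a href="/company/1">Example Studio</a>'
    soup = BeautifulSoup(fragment, 'html.parser')
    key = soup.b.text if soup.b else 'UNKNOWN_TAG'                    # 'メーカー'
    value = ','.join(a.text for a in soup.find_all('a', href=True))  # 'Example Studio'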