update scrapinglib
scrapinglib/__init__.py
@@ -3,15 +3,15 @@
 import re
 import json
 
-from scrapinglib.airav import Airav
-from scrapinglib.carib import Carib
-from scrapinglib.dlsite import Dlsite
-from scrapinglib.fanza import Fanza
-from scrapinglib.gcolle import Gcolle
-from scrapinglib.getchu import Getchu
-from scrapinglib.jav321 import Jav321
-from scrapinglib.javdb import Javdb
-from scrapinglib.mv91 import Mv91
+from .airav import Airav
+from .carib import Carib
+from .dlsite import Dlsite
+from .fanza import Fanza
+from .gcolle import Gcolle
+from .getchu import Getchu
+from .jav321 import Jav321
+from .javdb import Javdb
+from .mv91 import Mv91
 from .fc2 import Fc2
 from .madou import Madou
 from .mgstage import Mgstage
@@ -19,28 +19,30 @@ from .javbus import Javbus
 from .xcity import Xcity
 from .avsox import Avsox
 
+from .tmdb import Tmdb
 
-def search(number, souces=None, proxies=None, verify=None, dbcookies=None, dbsite=None, morestoryline=True):
-    """
-    TODO support more sites: douban, imdb, tmdb, anidb, etc.
-    type distinguishes r18 from normal
+def search(number, sources: str=None, proxies=None, verify=None, type='adult',
+           dbcookies=None, dbsite=None, morestoryline=False):
+    """ Search metadata by ``number``/``movie name``
+
+    :param number: number/name depends on type
+    :param sources: sources string with `,` like ``avsox,javbus``
+    :param type: ``adult``, ``general``
     """
     sc = Scraping()
-    return sc.search(number, souces, proxies=proxies, verify=verify,
-                     dbcookies=dbcookies, dbsite=dbsite,
-                     morestoryline=morestoryline)
+    return sc.search(number, sources, proxies=proxies, verify=verify, type=type,
+                     dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline)
 
 
 class Scraping():
     """
 
-    Only fetches the content, without modifying it
+    Only scrapes the content, without modifying it
 
     If translation and the like are needed, wrap another layer around this method
-    No naming rule handling here; that belongs in the wrapper layer, keeping the internals simple
+    No naming rule handling here either
 
     Scraper sources can be specified; the currently supported sources can be queried
-    Multiple stage names of an actress can be looked up
 
     Parameters:
     number
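
A hedged usage sketch of the reworked entry point above; it assumes the package is importable as scrapinglib, and the numbers passed in are made-up examples:

    import scrapinglib

    # adult lookup (the default type), trying avsox then javbus first
    meta = scrapinglib.search('123456-789', sources='avsox,javbus')

    # general movie/TV lookup, routed to the new Tmdb scraper
    info = scrapinglib.search('550', type='general')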
@@ -78,19 +80,33 @@ class Scraping():
 
     proxies = None
     verify = None
 
     dbcookies = None
     dbsite = None
     # use the storyline method to fetch more of the plot
-    morestoryline = True
+    morestoryline = False
 
-    def search(self, number, sources=None, proxies=None, verify=None,
-               dbcookies=None, dbsite=None, morestoryline=True):
+    def search(self, number, sources=None, proxies=None, verify=None, type='adult',
+               dbcookies=None, dbsite=None, morestoryline=False):
         self.proxies = proxies
         self.verify = verify
         self.dbcookies = dbcookies
         self.dbsite = dbsite
         self.morestoryline = morestoryline
+        if type == 'adult':
+            return self.searchAdult(number, sources)
+        else:
+            return self.searchGeneral(number, sources)
 
+    def searchGeneral(self, number, sources):
+        """ Search movies and TV shows
+        imdb,tmdb
+        """
+        data = Tmdb().scrape(number, self)
+        json_data = json.loads(data)
+        return json_data
+
+    def searchAdult(self, number, sources):
         sources = self.checkSources(sources, number)
         json_data = {}
         for source in sources:
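
Note that searchGeneral above round-trips the result through json.loads, i.e. it assumes Tmdb().scrape() returns a JSON string. A stand-in illustration with made-up data (the shape is illustrative only):

    import json

    data = '{"title": "Fight Club", "release": "1999-10-15"}'  # what scrape() might emit
    json_data = json.loads(data)
    print(json_data['title'])  # Fight Club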
@@ -110,7 +126,7 @@ class Scraping():
                 print(f"[+]Find movie [{number}] metadata on website '{source}'")
                 break
             except:
-                break
+                continue
 
         # Return if data not found in all sources
         if not json_data:
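
The one-word change above is behavioural: with continue, a failing source no longer aborts the whole loop, so the remaining sources still get tried. A minimal illustration with a made-up source list and a simulated failure:

    for source in ['airav', 'javbus', 'carib']:
        try:
            raise RuntimeError('scrape failed')  # simulate a broken source
        except:
            continue  # formerly `break`: would have given up after the first failure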
@@ -122,9 +138,9 @@ class Scraping():
 
     def checkSources(self, c_sources, file_number):
         if not c_sources:
-            c_sources = self.full_sources
-        sources = c_sources.split(',')
+            sources = self.full_sources
+        else:
+            sources = c_sources.split(',')
         def insert(sources,source):
             if source in sources:
                 sources.insert(0, sources.pop(sources.index(source)))
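
How the insert() helper reorders the list: the matched source is popped from wherever it sits and pushed to the front, so it gets queried first. A standalone demonstration of that one-liner:

    sources = ['airav', 'javbus', 'carib', 'fc2']
    sources.insert(0, sources.pop(sources.index('carib')))
    print(sources)  # ['carib', 'airav', 'javbus', 'fc2']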
@@ -134,33 +150,32 @@ class Scraping():
         # if the input file name matches certain rules,
         # move some web service to the beginning of the list
         lo_file_number = file_number.lower()
-        if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
+        if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
                                    ):
             sources = insert(sources,"carib")
-        elif "item" in file_number:
+        elif "item" in file_number or "GETCHU" in file_number.upper():
             sources = insert(sources,"getchu")
-        elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
+        elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
+            sources = insert(sources, "getchu")
+            sources = insert(sources, "dlsite")
+        elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
             if "avsox" in sources:
                 sources = insert(sources,"avsox")
         elif "mgstage" in sources and \
-            (re.match(r"\d+\D+", file_number) or "siro" in lo_file_number):
+            (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
             sources = insert(sources,"mgstage")
         elif "fc2" in lo_file_number:
             if "fc2" in sources:
                 sources = insert(sources,"fc2")
         elif "gcolle" in sources and (re.search("\d{6}", file_number)):
             sources = insert(sources,"gcolle")
-        elif "dlsite" in sources and (
-            "rj" in lo_file_number or "vj" in lo_file_number
-        ):
-            sources = insert(sources,"dlsite")
-        elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
+        elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
             if "xcity" in sources:
                 sources = insert(sources,"xcity")
             if "madou" in sources:
                 sources = insert(sources,"madou")
         elif "madou" in sources and (
-            re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
+            re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
         ):
             sources = insert(sources,"madou")
 
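
Since every rewritten pattern keeps its ^ anchor, re.search behaves exactly like re.match for them; the genuinely new part is the kana character class that routes Japanese titles to getchu/dlsite. A quick check with made-up inputs:

    import re

    # anchored pattern: match and search agree
    assert bool(re.match(r"^\d{6}-\d{3}", "123456-789")) == \
           bool(re.search(r"^\d{6}-\d{3}", "123456-789"))

    # hiragana/katakana range detects Japanese titles
    print(bool(re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", "ドラマCD")))  # True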
@@ -2,9 +2,7 @@
 
 import re
 from lxml import etree
-
-from scrapinglib import httprequest
+from . import httprequest
 
 from .parser import Parser
 
scrapinglib/tmdb.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+
+from .parser import Parser
+
+
+class Tmdb(Parser):
+    """
+    Two implementations: with an apikey and without
+    apikey
+    """
+    source = 'tmdb'
+    imagecut = 0
+    apikey = None
+
+    expr_title = './/head/meta[@property="og:title"]'
+    expr_release = '//div/span[@class="release"]/text()'
+    expr_cover = './/head/meta[@property="og:image"]'
+    expr_outline = './/head/meta[@property="og:description"]'
+
+    # def search(self, number):
+    #     self.detailurl = self.queryNumberUrl(number)
+    #     detailpage = self.getHtml(self.detailurl)
+
+    def queryNumberUrl(self, number):
+        """
+        TODO distinguish ID from name
+        """
+        id = number
+        movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
+        return movieUrl
+
+    def getTitle(self, htmltree):
+        return self.getTreeIndex(htmltree, self.expr_title).get('content')
+
+    def getCover(self, htmltree):
+        return "https://www.themoviedb.org" + self.getTreeIndex(htmltree, self.expr_cover).get('content')
+
+    def getOutline(self, htmltree):
+        return self.getTreeIndex(htmltree, self.expr_outline).get('content')
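
A sketch of the URL the new scraper builds; only queryNumberUrl from the file above is exercised, and 550 is a made-up TMDB movie id:

    from scrapinglib.tmdb import Tmdb

    print(Tmdb().queryNumberUrl('550'))
    # https://www.themoviedb.org/movie/550?language=zh-CN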