update scrapinglib

This commit is contained in:
Mathhew
2022-05-30 15:05:08 +08:00
parent feccd67115
commit e665bceb5b
3 changed files with 93 additions and 40 deletions

View File

@@ -3,15 +3,15 @@
import re import re
import json import json
from scrapinglib.airav import Airav from .airav import Airav
from scrapinglib.carib import Carib from .carib import Carib
from scrapinglib.dlsite import Dlsite from .dlsite import Dlsite
from scrapinglib.fanza import Fanza from .fanza import Fanza
from scrapinglib.gcolle import Gcolle from .gcolle import Gcolle
from scrapinglib.getchu import Getchu from .getchu import Getchu
from scrapinglib.jav321 import Jav321 from .jav321 import Jav321
from scrapinglib.javdb import Javdb from .javdb import Javdb
from scrapinglib.mv91 import Mv91 from .mv91 import Mv91
from .fc2 import Fc2 from .fc2 import Fc2
from .madou import Madou from .madou import Madou
from .mgstage import Mgstage from .mgstage import Mgstage
@@ -19,28 +19,30 @@ from .javbus import Javbus
from .xcity import Xcity from .xcity import Xcity
from .avsox import Avsox from .avsox import Avsox
from .tmdb import Tmdb
def search(number, sources: str = None, proxies=None, verify=None, type='adult',
           dbcookies=None, dbsite=None, morestoryline=False):
    """Search movie metadata by id number or title.

    Thin module-level convenience wrapper around :class:`Scraping`.

    :param number: id number or title, interpreted according to ``type``
    :param sources: comma-separated source names, e.g. ``avsox,javbus``
    :param type: ``adult`` or ``general``
    """
    scraper = Scraping()
    return scraper.search(number, sources, proxies=proxies, verify=verify,
                          type=type, dbcookies=dbcookies, dbsite=dbsite,
                          morestoryline=morestoryline)
class Scraping(): class Scraping():
""" """
需要获得内容,不经修改 爬取内容,不经修改
如果需要翻译等,再针对此方法封装一层 如果需要翻译等,再针对此方法封装一层
不做 naming rule 处理,放到封装层,保持内部简介 不做 naming rule 处理
可以指定刮削库,可查询当前支持的刮削库 可以指定刮削库,可查询当前支持的刮削库
可查询演员多个艺名
参数: 参数:
number number
@@ -78,19 +80,33 @@ class Scraping():
proxies = None proxies = None
verify = None verify = None
dbcookies = None dbcookies = None
dbsite = None dbsite = None
# 使用storyline方法进一步获取故事情节 # 使用storyline方法进一步获取故事情节
morestoryline = True morestoryline = False
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
           dbcookies=None, dbsite=None, morestoryline=False):
    """Remember per-call connection options, then dispatch by content type.

    ``type == 'adult'`` routes to :meth:`searchAdult`; any other value
    routes to :meth:`searchGeneral` (movies / TV shows).
    """
    # Stash configuration on the instance so the per-site scrapers can read it.
    self.proxies = proxies
    self.verify = verify
    self.dbcookies = dbcookies
    self.dbsite = dbsite
    self.morestoryline = morestoryline
    if type != 'adult':
        return self.searchGeneral(number, sources)
    return self.searchAdult(number, sources)
def searchGeneral(self, number, sources):
    """Look up a general movie / TV title.

    Only TMDB is queried for now; ``sources`` is accepted for interface
    symmetry with :meth:`searchAdult` but is not consulted yet.
    """
    raw = Tmdb().scrape(number, self)
    return json.loads(raw)
def searchAdult(self, number, sources):
sources = self.checkSources(sources, number) sources = self.checkSources(sources, number)
json_data = {} json_data = {}
for source in sources: for source in sources:
@@ -110,7 +126,7 @@ class Scraping():
print(f"[+]Find movie [{number}] metadata on website '{source}'") print(f"[+]Find movie [{number}] metadata on website '{source}'")
break break
except: except:
break continue
# Return if data not found in all sources # Return if data not found in all sources
if not json_data: if not json_data:
@@ -122,9 +138,9 @@ class Scraping():
def checkSources(self, c_sources, file_number): def checkSources(self, c_sources, file_number):
if not c_sources: if not c_sources:
c_sources = self.full_sources sources = self.full_sources
else:
sources = c_sources.split(',') sources = c_sources.split(',')
def insert(sources,source): def insert(sources,source):
if source in sources: if source in sources:
sources.insert(0, sources.pop(sources.index(source))) sources.insert(0, sources.pop(sources.index(source)))
@@ -134,33 +150,32 @@ class Scraping():
# if the input file name matches certain rules, # if the input file name matches certain rules,
# move some web service to the beginning of the list # move some web service to the beginning of the list
lo_file_number = file_number.lower() lo_file_number = file_number.lower()
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number) if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
): ):
sources = insert(sources,"carib") sources = insert(sources,"carib")
elif "item" in file_number: elif "item" in file_number or "GETCHU" in file_number.upper():
sources = insert(sources,"getchu") sources = insert(sources,"getchu")
elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number: elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
sources = insert(sources, "getchu")
sources = insert(sources, "dlsite")
elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
if "avsox" in sources: if "avsox" in sources:
sources = insert(sources,"avsox") sources = insert(sources,"avsox")
elif "mgstage" in sources and \ elif "mgstage" in sources and \
(re.match(r"\d+\D+", file_number) or "siro" in lo_file_number): (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources,"mgstage") sources = insert(sources,"mgstage")
elif "fc2" in lo_file_number: elif "fc2" in lo_file_number:
if "fc2" in sources: if "fc2" in sources:
sources = insert(sources,"fc2") sources = insert(sources,"fc2")
elif "gcolle" in sources and (re.search("\d{6}", file_number)): elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources = insert(sources,"gcolle") sources = insert(sources,"gcolle")
elif "dlsite" in sources and ( elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
"rj" in lo_file_number or "vj" in lo_file_number
):
sources = insert(sources,"dlsite")
elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
if "xcity" in sources: if "xcity" in sources:
sources = insert(sources,"xcity") sources = insert(sources,"xcity")
if "madou" in sources: if "madou" in sources:
sources = insert(sources,"madou") sources = insert(sources,"madou")
elif "madou" in sources and ( elif "madou" in sources and (
re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number) re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
): ):
sources = insert(sources,"madou") sources = insert(sources,"madou")

View File

@@ -2,9 +2,7 @@
import re import re
from lxml import etree from lxml import etree
from . import httprequest
from scrapinglib import httprequest
from .parser import Parser from .parser import Parser

40
scrapinglib/tmdb.py Normal file
View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
from .parser import Parser
class Tmdb(Parser):
    """Scraper for themoviedb.org (general movies).

    Two implementations are possible: via the authenticated TMDB API
    (``apikey``) or by scraping the public site; only the key-less
    HTML-scraping path is implemented here.
    """

    source = 'tmdb'
    # 0: do not crop the cover image
    imagecut = 0
    # Reserved for the future API-key implementation.
    apikey = None

    # XPath expressions evaluated against the movie detail page;
    # title/cover/outline come from the OpenGraph meta tags.
    expr_title = './/head/meta[@property="og:title"]'
    expr_release = '//div/span[@class="release"]/text()'
    expr_cover = './/head/meta[@property="og:image"]'
    expr_outline = './/head/meta[@property="og:description"]'

    def queryNumberUrl(self, number):
        """Build the detail-page URL for ``number``.

        TODO: distinguish a TMDB numeric id from a free-text title; the
        value is currently used verbatim as the id path segment.
        """
        movie_id = number  # renamed: ``id`` would shadow the builtin
        return "https://www.themoviedb.org/movie/" + movie_id + "?language=zh-CN"

    def getTitle(self, htmltree):
        """Title taken from the ``og:title`` meta tag."""
        return self.getTreeIndex(htmltree, self.expr_title).get('content')

    def getCover(self, htmltree):
        """Cover-image URL; the og:image path is made absolute."""
        return "https://www.themoviedb.org" + self.getTreeIndex(htmltree, self.expr_cover).get('content')

    def getOutline(self, htmltree):
        """Plot outline from the ``og:description`` meta tag."""
        return self.getTreeIndex(htmltree, self.expr_outline).get('content')