Add crawler named gcolle.py
This commit is contained in:
@@ -24,6 +24,7 @@ from . import carib
|
|||||||
from . import fc2club
|
from . import fc2club
|
||||||
from . import mv91
|
from . import mv91
|
||||||
from . import madou
|
from . import madou
|
||||||
|
from . import gcolle
|
||||||
|
|
||||||
|
|
||||||
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
|
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
|
||||||
@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
|
|||||||
"carib": carib.main,
|
"carib": carib.main,
|
||||||
"fc2club": fc2club.main,
|
"fc2club": fc2club.main,
|
||||||
"mv91": mv91.main,
|
"mv91": mv91.main,
|
||||||
"madou": madou.main
|
"madou": madou.main,
|
||||||
|
"gcolle": gcolle.main,
|
||||||
}
|
}
|
||||||
|
|
||||||
conf = config.getInstance()
|
conf = config.getInstance()
|
||||||
@@ -75,6 +77,8 @@ def get_data_from_json(file_number, oCC):
|
|||||||
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
|
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
|
||||||
):
|
):
|
||||||
sources.insert(0, sources.pop(sources.index("carib")))
|
sources.insert(0, sources.pop(sources.index("carib")))
|
||||||
|
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
|
||||||
|
sources.insert(0, sources.pop(sources.index("gcolle")))
|
||||||
elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
|
elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
|
||||||
if "javdb" in sources:
|
if "javdb" in sources:
|
||||||
sources.insert(0, sources.pop(sources.index("javdb")))
|
sources.insert(0, sources.pop(sources.index("javdb")))
|
||||||
|
|||||||
99
WebCrawler/gcolle.py
Normal file
99
WebCrawler/gcolle.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
|
|
||||||
|
from ADC_function import *
|
||||||
|
from lxml import etree
|
||||||
|
from requests_html import HTMLSession
|
||||||
|
|
||||||
|
class Crawler:
    """Small XPath helper around an HTML document parsed with ``etree.HTML``."""

    def __init__(self, htmlcode):
        """Parse *htmlcode* and keep the resulting tree for later queries."""
        self.html = etree.HTML(htmlcode)

    def getString(self, _xpath):
        """Return the first result of *_xpath*, or ``""`` when there is none.

        An empty *_xpath* is not evaluated and also yields ``""``.
        """
        if _xpath == "":
            return ""
        result = self.html.xpath(_xpath)
        try:
            return result[0]
        except (IndexError, TypeError):
            # IndexError: nothing matched.
            # TypeError: the result is not subscriptable.
            return ""

    def getStrings(self, _xpath):
        """Return the result of evaluating *_xpath*, unchanged."""
        return self.html.xpath(_xpath)

    def getOutline(self, _xpath):
        """Return all results of *_xpath* joined with newlines.

        Yields ``""`` when nothing matched or when the result cannot be
        joined as strings.
        """
        result = self.html.xpath(_xpath)
        try:
            return "\n".join(result)
        except TypeError:
            # The result is not an iterable of strings.
            return ""
|
||||||
|
|
||||||
|
def main(number):
    """Scrape metadata for a single gcolle.net product page.

    Args:
        number: Product id, optionally carrying a ``GCOLLE-`` prefix
            (matched case-insensitively because the input is upper-cased).

    Returns:
        A dict with the scraped fields. ``{'title': ''}`` is returned when
        the follow-up link on the first page cannot be found, or when the
        product number shown on the page differs from the requested one.
    """
    config_file = config.getInstance()
    browser = HTMLSession()

    number = number.upper().replace('GCOLLE-', '')
    website = 'https://gcolle.net/product_info.php/products_id/' + number

    htmlcode = get_html_requests_html(browser, website)
    html = etree.HTML(htmlcode)
    # R18 continue: the first response only offers a link onward; the real
    # product page is whatever that link points to.
    continue_links = html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
    if not continue_links:
        # Without the link there is nothing to scrape; report failure the
        # same way as the number-mismatch case below instead of raising.
        if config_file.debug():
            print('[!]gcolle.py: R18 continue link not found')
        return {'title': ''}
    htmlcode = get_html_requests_html(browser, continue_links[0])
    gcolle_crawler = Crawler(htmlcode)

    number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
    if number != number_html:
        if config_file.debug():
            print('[!]gcolle.py: number not match')
        return {'title': ''}

    # get extrafanart url: prefer plain <img> sources, fall back to linked ones
    extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
    if len(extrafanart) == 0:
        extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
    # Add "https:" in each extrafanart url
    extrafanart = ['https:' + url for url in extrafanart]

    # The uploader name is the only credit on the page, so it fills every
    # person/brand field; query it once.
    uploader = gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()')

    # Registration date; tolerate a missing/unexpected value instead of
    # raising IndexError.
    registered = gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime')
    year_match = re.search(r'\d{4}', registered)
    release_match = re.search(r'\d{4}-\d{2}-\d{2}', registered)

    cover = "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href')

    dic = {
        "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'),
        "studio": uploader,
        "year": year_match.group(0) if year_match else '',
        "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
        "runtime": '',
        "director": uploader,
        "actor": uploader,
        "release": release_match.group(0) if release_match else '',
        "number": "GCOLLE-" + str(number_html),
        "cover": cover,
        "thumb": cover,
        "trailer": '',
        "actor_photo": '',
        "imagecut": 4,  # a value of 4 also marks a censored title; the cover is cropped using face detection
        "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
        "extrafanart": extrafanart,
        "label": uploader,
        "website": website,
        "source": 'gcolle.py',
        "series": uploader,
        '无码': False,
    }
    return dic
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke run: scrape a handful of known product ids in order.
    for sample_id in ('840724', '840386', '838671', '814179', '834255', '814179'):
        main(sample_id)
|
||||||
@@ -46,7 +46,7 @@ max_title_len=50
|
|||||||
update_check=1
|
update_check=1
|
||||||
|
|
||||||
[priority]
|
[priority]
|
||||||
website=javbus,airav,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91,javdb
|
website=javbus,airav,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91,javdb,gcolle
|
||||||
|
|
||||||
[escape]
|
[escape]
|
||||||
literals=\()/
|
literals=\()/
|
||||||
|
|||||||
Reference in New Issue
Block a user