Update 3.5.1

This commit is contained in:
Yoshiko2
2020-07-17 15:06:42 +08:00
committed by GitHub
parent df3a959852
commit 82315472f6
4 changed files with 25 additions and 7 deletions

View File

@@ -1,12 +1,16 @@
import os import os
import configparser import configparser
import codecs
class Config: class Config:
def __init__(self, path: str = "config.ini"): def __init__(self, path: str = "config.ini"):
if os.path.exists(path): if os.path.exists(path):
self.conf = configparser.ConfigParser() self.conf = configparser.ConfigParser()
self.conf.read(path, encoding="utf-8") try:
self.conf.read(path, encoding="utf-8-sig")
except:
self.conf.read(path, encoding="utf-8")
else: else:
print("[-] Config file not found! Use the default settings") print("[-] Config file not found! Use the default settings")
self.conf = self._default_config() self.conf = self._default_config()

View File

@@ -68,13 +68,22 @@ def getCover_small(a, index=0):
# javdb sometimes returns multiple results # javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with the correct index number # DO NOT just get the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] try:
if not 'https' in result: result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
result = 'https:' + result if not 'https' in result:
return result result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode): def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")).strip(" ['']") try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result return result
def getDirector(a): def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()

View File

@@ -45,4 +45,4 @@ def get_number(filepath: str) -> str:
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod(raise_on_error=True) doctest.testmod(raise_on_error=True)

5
test.py Normal file
View File

@@ -0,0 +1,5 @@
## 2020.6.22 更新
* 改进网站爬虫子程序参数混乱
* 修复命名规则release参数带```/```的问题
* 新增socks5本地代理连接
* 新增命名规则series参数