Add support for javlib
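This commit adds javlib as a new metadata source. javlib.py implements the scraper; ADC_function.py gains a get_proxy() helper, an extended get_html() (optional ua and return_type parameters), and a get_javlib_cookie() that uses cloudscraper to obtain the Cloudflare cookies and matching User-Agent; javlib is appended to the [priority] website list in config.ini and to the scraper mapping in core.py; cloudscraper is added to requirements.txt; and the CI workflow now builds per OS so pyinstaller can bundle cloudscraper's data files via --add-data.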
.github/workflows/main.yml (vendored, 28 lines changed)
@@ -25,11 +25,35 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
-      - name: Build with pyinstaller
-        run: |
-          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py
+      # - name: Show cloudscraper package location
+      #   run: |
+      #     python -c 'import cloudscraper as _; print(_.__path__)'
+
+      - name: Build with pyinstaller (windows)
+        if: matrix.os == 'windows-latest'
+        run: |
+          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data='C:\\hostedtoolcache\\windows\\Python\\3.7.6\\x64\\lib\\site-packages\\cloudscraper\\;cloudscraper'
+
+      - name: Build with pyinstaller (mac)
+        if: matrix.os == 'macos-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/Users/runner/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
+
+      - name: Build with pyinstaller (ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/opt/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
 
       - name: Copy config.ini
         run: |
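Note on the build steps above: the --add-data arguments hardcode each runner's hosted-toolcache site-packages path. As a hedged aside (not part of this commit), the commented-out "Show cloudscraper package location" step suggests the same directory could be resolved programmatically inside the CI job, roughly:

    # Sketch only: resolve cloudscraper's install directory at build time,
    # then feed it to pyinstaller --add-data instead of a hardcoded path.
    import os
    import cloudscraper

    pkg_dir = os.path.dirname(cloudscraper.__file__)
    sep = ";" if os.name == "nt" else ":"   # pyinstaller uses ';' on Windows, ':' elsewhere
    print("--add-data='{}{}cloudscraper'".format(pkg_dir, sep))
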
ADC_function.py
@@ -1,5 +1,6 @@
 import requests
 from lxml import etree
+import cloudscraper
 
 import config
 
@@ -23,23 +24,39 @@ def getXpathSingle(htmlcode,xpath):
     return result1
 
 
+def get_proxy(proxy: str) -> dict:
+    if proxy:
+        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
+    else:
+        proxies = {}
+
+    return proxies
+
+
 # Core web request
-def get_html(url, cookies=None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
     proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    if ua is None:
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}  # noqa
+    else:
+        headers = {"User-Agent": ua}
+
     for i in range(retry_count):
         try:
             if not proxy == '':
-                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
             else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+
+            result.encoding = "utf-8"
+
+            if return_type == "object":
+                return result
+            else:
+                return result.text
+
         except requests.exceptions.ProxyError:
             print("[-]Connect retry {}/{}".format(i + 1, retry_count))
     print('[-]Connect Failed! Please check your Proxy or Network!')
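A usage illustration of the extended get_html() (the URL and cookie value below are placeholders, not from the commit): the old call style still returns page text with the default User-Agent, while javlib.py passes its Cloudflare cookies and User-Agent and asks for the Response object back so it can inspect result.url:

    from ADC_function import get_html

    text = get_html("https://example.com")              # old behaviour: returns page text

    resp = get_html("https://example.com",
                    cookies={"cf_clearance": "..."},     # placeholder cookie
                    ua="Mozilla/5.0",                    # placeholder User-Agent
                    return_type="object")                # get the full Response back
    print(resp.url)
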
@@ -49,11 +66,7 @@ def get_html(url, cookies=None):
 
 def post_html(url: str, query: dict) -> requests.Response:
     proxy, timeout, retry_count = config.Config().proxy()
-    if proxy:
-        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
-    else:
-        proxies = {}
-
+    proxies = get_proxy(proxy)
 
     for i in range(retry_count):
         try:
@@ -64,3 +77,25 @@ def post_html(url: str, query: dict) -> requests.Response:
     print("[-]Connect Failed! Please check your Proxy or Network!")
     input("Press ENTER to exit!")
     exit()
+
+
+def get_javlib_cookie() -> [dict, str]:
+    proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    raw_cookie = {}
+    user_agent = ""
+
+    # Get __cfduid/cf_clearance and user-agent
+    for i in range(retry_count):
+        try:
+            raw_cookie, user_agent = cloudscraper.get_cookie_string(
+                "http://www.m45e.com/",
+                proxies=proxies
+            )
+        except requests.exceptions.ProxyError:
+            print("[-] ProxyError, retry {}/{}".format(i+1, retry_count))
+        except cloudscraper.exceptions.CloudflareIUAMError:
+            print("[-] IUAMError, retry {}/{}".format(i+1, retry_count))
+
+    return raw_cookie, user_agent
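cloudscraper.get_cookie_string() hands back the raw Cookie header string together with the User-Agent that solved the challenge; javlib.py below turns that string into a requests-style dict. A condensed sketch of that conversion (assuming the call above succeeded):

    from http.cookies import SimpleCookie

    raw_cookie, user_agent = get_javlib_cookie()
    jar = SimpleCookie()
    jar.load(raw_cookie)                                  # parses "k1=v1; k2=v2"
    cookies = {key: morsel.value for key, morsel in jar.items()}
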
config.ini
@@ -17,7 +17,7 @@ naming_rule=number+'-'+title
 update_check=1
 
 [priority]
-website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321
+website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib
 
 [escape]
 literals=\()/
core.py (2 lines changed)
@@ -15,6 +15,7 @@ import javbus
 import javdb
 import mgstage
 import xcity
+import javlib
 
 
 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -53,6 +54,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # 从JSON
         "mgstage": mgstage.main,
         "jav321": jav321.main,
         "xcity": xcity.main,
+        "javlib": javlib.main,
     }
 
     # default fetch order list, from the beginning to the end
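For context, the mapping above is what lets the [priority] website list in config.ini decide which scraper runs; a simplified, hypothetical sketch of that dispatch (variable names illustrative, core.py's real loop differs in its details):

    import json

    file_number = "DVMC-003"                              # sample number from javlib.py's test list
    func_mapping = {"javlib": javlib.main}                # plus the other sources shown above

    for source in "javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib".split(","):
        scraper = func_mapping.get(source)
        if scraper is None:
            continue
        data = json.loads(scraper(file_number))           # each scraper returns a JSON string
        if data:                                          # stop at the first source with a hit
            break
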
javlib.py (new file, 109 lines)
@@ -0,0 +1,109 @@
+import json
+
+import bs4
+from bs4 import BeautifulSoup
+from lxml import html
+from http.cookies import SimpleCookie
+
+from ADC_function import get_javlib_cookie, get_html
+
+
+def main(number: str):
+    raw_cookies, user_agent = get_javlib_cookie()
+
+    # Blank cookies mean the javlib site returned an error
+    if not raw_cookies:
+        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+    # Manually construct a cookie dictionary
+    s_cookie = SimpleCookie()
+    s_cookie.load(raw_cookies)
+    cookies = {}
+    for key, morsel in s_cookie.items():
+        cookies[key] = morsel.value
+
+    # Scraping
+    result = get_html(
+        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
+        cookies=cookies,
+        ua=user_agent,
+        return_type="object"
+    )
+    soup = BeautifulSoup(result.text, "html.parser")
+    lx = html.fromstring(str(soup))
+
+    if "/?v=jav" in result.url:
+        dic = {
+            "title": get_title(lx, soup),
+            "studio": get_table_el_single_anchor(soup, "video_maker"),
+            "year": get_table_el_td(soup, "video_date")[:4],
+            "outline": "",
+            "director": get_table_el_single_anchor(soup, "video_director"),
+            "cover": get_cover(lx),
+            "imagecut": 1,
+            "actor_photo": "",
+            "website": result.url,
+            "source": "javlib.py",
+            "actor": get_table_el_multi_anchor(soup, "video_cast"),
+            "label": get_table_el_td(soup, "video_label"),
+            "tag": get_table_el_multi_anchor(soup, "video_genres"),
+            "number": get_table_el_td(soup, "video_id"),
+            "release": get_table_el_td(soup, "video_date"),
+            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
+        }
+    else:
+        dic = {}
+
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+
+def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
+    return lx.xpath(xpath)[0].strip()
+
+
+def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tag = soup.find(id=tag_id).find("a")
+
+    if tag is not None:
+        return tag.string.strip()
+    else:
+        return ""
+
+
+def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("a")
+
+    return process(tags)
+
+
+def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("td", class_="text")
+
+    return process(tags)
+
+
+def process(tags: bs4.element.ResultSet) -> str:
+    values = []
+    for tag in tags:
+        value = tag.string
+        if value is not None and value != "----":
+            values.append(value)
+
+    return ",".join(x for x in values if x)
+
+
+def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
+    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
+    number = get_table_el_td(soup, "video_id")
+
+    return title.replace(number, "").strip()
+
+
+def get_cover(lx: html.HtmlElement) -> str:
+    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
+
+
+if __name__ == "__main__":
+    # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["DVMC-003"]
+    for num in lists:
+        print(main(num))
requirements.txt
@@ -4,3 +4,4 @@ lxml
 beautifulsoup4
 pillow
 pyinstaller
+cloudscraper