update 5.20 all data
code/caict-bg.py (new file, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.19

import re
from urllib import request


# 1. Fetch the data
def get_html(url):
    # Spoof a browser User-Agent for the https request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # Handle exceptions: ignore errors and keep going
    try:
        response = request.urlopen(req)
        content = response.read().decode('utf-8')
        return content
    except Exception:
        # Placeholder content so the later steps simply match nothing
        return 'test2022!'


# 2. Parse the data (set up the match patterns)
def get_url(content):
    # Different URLs call for different match patterns (level 1: report titles)
    pattern = re.compile('<td.*?><span.*?>.*?<a href="(.*?)".*?>(.*?)</a>.*?</td>.*?<td.*?><span.*?>(.*?)</span></td>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


def get_url2(content):
    # Different URLs call for different match patterns (level 2: file download addresses)
    pattern = re.compile('<b>【全文下载】</b>.*?<a href="(.*?)".*?class=kxyj_text>(.*?)</a>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


# 3. Display the data
def show_result(items):
    # Different URLs call for different match patterns
    # Counter
    global tsum
    tsum = 0
    # Level-1 addresses of the files to download: report titles
    # Handle exceptions: ignore errors and keep going
    try:
        for item in items:
            url = item[0]
            cname = item[1]
            cdate = item[2]
            url2 = 'http://www.caict.ac.cn/kxyj/qwfb/ztbg' + str(url).replace('./', '/')
            strs1 = '|[' + cname + '](' + url2 + ')' + '|' + cdate
            tsum = tsum + 1
            # print(strs1)
            # Level-2 addresses of the files to download: actual download links
            html2 = get_html(url2)
            items2 = get_url2(html2)
            # Handle exceptions: ignore errors and keep going
            try:
                for item2 in items2:
                    url3 = item2[0]
                    url3 = str(url3)
                    ss = url3[4:10]
                    url3 = 'http://www.caict.ac.cn/kxyj/qwfb/ztbg/' + ss + str(url3).replace('./', '/')
                    uname = item[1]
                    # Markdown-formatted row
                    strs2 = strs1 + '|[' + uname + '](' + url3 + ')'
                    print(strs2)
            except Exception:
                pass
    except Exception:
        pass


# 4. Download the data (write it to a file)
def get_file(url):
    file_name = url.split('/')[-1]
    # Download directory (adjust as needed)
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    u = request.urlopen(req)
    f = open(file_names, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_names)


# Download the latest data
url = 'http://www.caict.ac.cn/kxyj/qwfb/ztbg/index.htm'
html = get_html(url)
items = get_url(html)
show_result(items)


# Download the historical data
tsum = 0   # per-page counter
tsums = 0  # running total
# Total number of pages (adjust manually)
for i in range(1, 12):
    url = 'http://www.caict.ac.cn/kxyj/qwfb/ztbg/index_' + str(i) + '.htm'
    # Fetch the page
    html = get_html(url)
    # Extract the download addresses
    items = get_url(html)
    # Download and display the data
    show_result(items)
    # Accumulate the counter
    tsums = tsum + tsums
    psums = str(tsum) + '/' + str(tsums)
    print(psums)


# Grand total
tsums = 'Total: ' + str(tsums) + ' files'
print(tsums)
code/caict-bps.py (new file, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.19

import re
from urllib import request


# 1. Fetch the data
def get_html(url):
    # Spoof a browser User-Agent for the https request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # Handle exceptions: ignore errors and keep going
    try:
        response = request.urlopen(req)
        content = response.read().decode('utf-8')
        return content
    except Exception:
        # Placeholder content so the later steps simply match nothing
        return 'test2022!'


# 2. Parse the data (set up the match patterns)
def get_url(content):
    # Different URLs call for different match patterns (level 1: report titles)
    pattern = re.compile('<td.*?><span.*?>.*?<a href="(.*?)".*?>(.*?)</a>.*?</td>.*?<td.*?><span.*?>(.*?)</span></td>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


def get_url2(content):
    # Different URLs call for different match patterns (level 2: file download addresses)
    pattern = re.compile('<b>【全文下载】</b>.*?<a href="(.*?)".*?class=kxyj_text>(.*?)</a>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


# 3. Display the data
def show_result(items):
    # Different URLs call for different match patterns
    # Counter
    global tsum
    tsum = 0
    # Level-1 addresses of the files to download: report titles
    # Handle exceptions: ignore errors and keep going
    try:
        for item in items:
            url = item[0]
            cname = item[1]
            cdate = item[2]
            url2 = 'http://www.caict.ac.cn/kxyj/qwfb/bps' + str(url).replace('./', '/')
            strs1 = '|[' + cname + '](' + url2 + ')' + '|' + cdate
            tsum = tsum + 1
            # print(strs1)
            # Level-2 addresses of the files to download: actual download links
            html2 = get_html(url2)
            items2 = get_url2(html2)
            # Handle exceptions: ignore errors and keep going
            try:
                for item2 in items2:
                    url3 = item2[0]
                    url3 = str(url3)
                    ss = url3[4:10]
                    url3 = 'http://www.caict.ac.cn/kxyj/qwfb/bps/' + ss + str(url3).replace('./', '/')
                    uname = item[1]
                    # Markdown-formatted row
                    strs2 = strs1 + '|[' + uname + '](' + url3 + ')'
                    print(strs2)
            except Exception:
                pass
    except Exception:
        pass


# 4. Download the data (write it to a file)
def get_file(url):
    file_name = url.split('/')[-1]
    # Download directory (adjust as needed)
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    u = request.urlopen(req)
    f = open(file_names, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_names)


# Download the latest data
url = 'http://www.caict.ac.cn/kxyj/qwfb/bps/index.htm'
html = get_html(url)
items = get_url(html)
show_result(items)


# Download the historical data
tsum = 0   # per-page counter
tsums = 0  # running total
# Total number of pages (adjust manually)
for i in range(1, 16):
    url = 'http://www.caict.ac.cn/kxyj/qwfb/bps/index_' + str(i) + '.htm'
    # Fetch the page
    html = get_html(url)
    # Extract the download addresses
    items = get_url(html)
    # Download and display the data
    show_result(items)
    # Accumulate the counter
    tsums = tsum + tsums
    psums = str(tsum) + '/' + str(tsums)
    print(psums)


# Grand total
tsums = 'Total: ' + str(tsums) + ' files'
print(tsums)
code/caict-qwsj.py (new file, 99 lines)
@@ -0,0 +1,99 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.19

import re
from urllib import request


# 1. Fetch the data
def get_html(url):
    # Spoof a browser User-Agent for the https request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    # Adjust the character set to match the actual page
    content = response.read().decode('utf-8')
    return content


# 2. Parse the data (set up the match patterns)
def get_url(content):
    # Different URLs call for different match patterns
    pattern = re.compile('<td.*?><span.*?>.*?<a href=\'(.*?)\'.*?>(.*?)</a>.*?</td>.*?<td.*?><span.*?>(.*?)</span></td>', re.S)
    items = re.findall(pattern, content)
    return items


# 3. Display the data
def show_result(items):
    # Different URLs call for different match patterns
    # Counter
    global tsum
    tsum = 0
    for item in items:
        url = item[0]
        cname = item[1]
        cdate = item[2]
        url2 = 'http://www.caict.ac.cn/kxyj/qwfb/qwsj' + str(url).replace('./', '/')
        strs1 = cdate + '|' + cname + '|' + url2
        # Strip extra characters
        strs1 = strs1.replace('- ', '')
        # Strip newlines (collapse to a single line)
        strs1 = strs1.replace('\n', '')
        # Increment the counter
        tsum = tsum + 1
        print(strs1)
        # Download the file
        get_file(url2)


# 4. Download the data (write it to a file)
def get_file(url):
    file_name = url.split('/')[-1]
    # Download directory (adjust as needed)
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    u = request.urlopen(req)
    f = open(file_names, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_names)


# Download the latest data
url = 'http://www.caict.ac.cn/kxyj/qwfb/qwsj/index.htm'
html = get_html(url)
items = get_url(html)
show_result(items)


# Download the historical data
tsum = 0   # per-page counter
tsums = 0  # running total
for i in range(1, 28):
    url = 'http://www.caict.ac.cn/kxyj/qwfb/qwsj/index_' + str(i) + '.htm'
    # Fetch the page
    html = get_html(url)
    # Extract the download addresses
    items = get_url(html)
    # Download and display the data
    show_result(items)
    # Accumulate the counter
    tsums = tsum + tsums
    psums = str(tsum) + '/' + str(tsums)
    print(psums)


tsums = 'Total: ' + str(tsums) + ' files'
print(tsums)
code/t500.py (new file, 137 lines)
@@ -0,0 +1,137 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.19

import re
from urllib import request


# 1. Fetch the data
def get_html(url):
    # Spoof a browser User-Agent for the https request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # Handle exceptions: ignore errors and keep going
    try:
        response = request.urlopen(req)
        # Set the encoding
        content = response.read().decode('utf-8-sig')
        return content
    except Exception:
        # Placeholder content so the later steps simply match nothing
        return 'test2022!'


# 2. Parse the data (set up the match patterns)
def get_url(content):
    # Different URLs call for different match patterns (level 1: file titles)
    pattern = re.compile('<div class="swiper-slide">.*?<a href="(.*?)" data-year=".*?">(.*?)</a>.*?</div>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


def get_url2(content, ctype, iyear):
    # Different URLs call for different match patterns (level 2: ranking tables)
    # Table layouts differ by year, so the match rules differ
    #pattern = re.compile('<tr>\n<td>(.*?)</td>\n<td><a.*?>(.*?)</a></td>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td.*?><span>.*?</span></td></tr>',re.S)
    # English annual rankings
    # 5 columns
    if ctype == 'en' and iyear in (2020, 2021):
        pattern = re.compile('<tr>\n<td>(.*?)</td>\n<td><a.*?>(.*?)</a></td>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td.*?><span>.*?</span></td></tr>', re.S)
    # 6 columns
    if ctype == 'en' and iyear in (2015, 2018, 2019):
        pattern = re.compile('<tr>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td><a.*?>(.*?)</a>.*?</td>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td>(.*?)</td></tr>', re.S)
    # 6 columns
    if ctype == 'en' and iyear in (2014, 2016, 2017):
        pattern = re.compile('<tr><td>(.*?)</td><td>(.*?)</td><td><a.*?>(.*?)</a>.*?</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>', re.S)
    # 6 columns
    if ctype == 'en' and iyear == 2013:
        pattern = re.compile('<tr>\n.*?<td>(.*?)</td>\n.*?<td>(.*?)</td>\n.*?<td><a.*?>(.*?)</a>.*?</td>\n.*?<td>(.*?)</td>\n.*?<td>(.*?)</td>\n.*?<td>(.*?)</td>\n.*?</tr>', re.S)
    # 6 columns
    if ctype == 'en' and iyear == 2012:
        pattern = re.compile('<tr><td.*?>(.*?)</td><td.*?>(.*?)</td><td.*?><a.*?>(.*?)</a>.*?</td><td.*?>(.*?)</td><td.*?>(.*?)</td><td.*?>(.*?)</td></tr>', re.S)
    # 6 columns
    if ctype == 'en' and iyear == 2011:
        pattern = re.compile('<tr>\n<td.*?>(.*?)</td>\n<td.*?>(.*?)</td>\n<td.*?><a.*?>(.*?)</a></td>\n<td.*?>(.*?)</td>\n<td.*?>(.*?)</td>\n<td.*?>(.*?)</td></tr>', re.S)
    # 5 columns
    if ctype == 'en' and iyear == 2010:
        pattern = re.compile('<tr>\n<td>(.*?)</td>\n<td>(.*?)</td>\n<td><a.*?>(.*?)</a></td>\n<td>(.*?)</td>\n<td>(.*?)</td></tr>', re.S)
    # Handle exceptions: ignore errors and keep going
    try:
        items = re.findall(pattern, content)
        return items
    except Exception:
        pass


# 3. Display the data
def show_result(items):
    # Different URLs call for different match patterns
    # Counter
    global tsum
    tsum = 0
    # List of data pages to fetch
    # Handle exceptions: ignore errors and keep going
    try:
        for item in items:
            url = item[0]
            uname = item[1]
            strs = uname + '|' + url
            print(strs)
            # Fetch the ranking data (table layouts differ by year, so the match rules differ)
            ctype = 'en'
            iyear = int(uname)
            html2 = get_html(url)
            items2 = get_url2(html2, ctype, iyear)
            # Handle exceptions: ignore errors and keep going
            try:
                for item2 in items2:
                    c1 = item2[0]
                    c2 = item2[1]
                    c3 = item2[2]
                    c4 = item2[3]
                    c5 = item2[4]
                    # Markdown table row
                    # if ctype=='cn':
                    cstr = '|' + c1 + '|' + c2 + '|' + c3 + '|' + c4 + '|' + c5
                    print(cstr)
            except Exception:
                pass
    except Exception:
        pass


# 4. Download the data (write it to a file)
def get_file(url):
    file_name = url.split('/')[-1]
    # Download directory (adjust as needed)
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    u = request.urlopen(req)
    f = open(file_names, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_names)


# Download the latest data
# Fortune Global 500 ranking
url = 'https://www.fortunechina.com/fortune500/index.htm'
# Fortune China 500 ranking
#url = 'https://www.fortunechina.com/fortune500/node_4302.htm'

html = get_html(url)
items = get_url(html)
show_result(items)