add tjnq.py tjgb.py miit.py internet.py
This commit is contained in:
76
code/internet.py
Normal file
76
code/internet.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# Author: yiliyas
|
||||
# Date: 2022.5.18
|
||||
|
||||
import re
|
||||
from urllib import request
|
||||
|
||||
# 1.获取数据
|
||||
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with request.urlopen(req) as response:
        # Adjust the charset here if the target page uses another encoding.
        return response.read().decode('utf-8')
|
||||
|
||||
# 2.处理数据(设置匹配条件)
|
||||
def get_url(content):
    """Extract the report entries from a listing page.

    Args:
        content: HTML text of the listing page.

    Returns:
        A list of 3-tuples, one per matched ``<li>`` element: the link's
        href, the link text, and the text of the ``<div>`` that follows
        the link's own ``<div>``.
    """
    # The pattern depends on the page layout; adjust it for other sites.
    entry_re = re.compile(
        '<li><div.*?><a href=\'(.*?)\'.*?>(.*?)</a></div><div.*?>(.*?)</div></li>',
        re.S,
    )
    return entry_re.findall(content)
|
||||
|
||||
# 3.显示数据
|
||||
def show_result(items):
    """Print and download every entry whose title contains '次'.

    Args:
        items: Iterable of match tuples as returned by ``get_url``;
            index 0 is the link's href and index 1 is its title.
    """
    for entry in items:
        href, title = entry[0], entry[1]
        # Only titles containing '次' are processed; everything else is skipped.
        if '次' not in str(title):
            continue
        # Build the full link: './' becomes '/' and the listing directory
        # is prepended.
        full_link = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg' + str(href).replace('./', '/')
        # Show the "title|link" line.
        print(title + '|' + full_link)
        # Download the file behind the link.
        get_file(full_link)
|
||||
|
||||
|
||||
# 4.下载数据(写入文件)
|
||||
def get_file(url, download_dir='E:/download/'):
    """Download *url* and save it under *download_dir*.

    The file is named after the last ``/``-separated segment of *url*.

    Args:
        url: Address of the file to download.
        download_dir: Directory prefix the file name is appended to. It
            must already exist and end with a path separator. Defaults to
            the location that was previously hard-coded.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        OSError: If the target file cannot be opened or written.
    """
    file_name = url.split('/')[-1]
    file_names = download_dir + file_name
    req = request.Request(url)
    block_sz = 8192
    # Both the connection and the output file are closed by the context
    # managers, even when a read or write fails part-way through.
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        # Copy in fixed-size chunks so large files are never held in memory.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print ("Sucessful to download" + " " + file_names)
|
||||
|
||||
|
||||
# Listing pages in processing order. Per the original comments, the archive
# pages index_1.htm .. index_6.htm hold reports 1-37 and the front page
# index.htm holds reports 38-49.
page_urls = [
    'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index_' + str(page_no) + '.htm'
    for page_no in range(1, 7)
]
page_urls.append('http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm')

for url in page_urls:
    # Fetch the listing page.
    html = get_html(url)
    # Extract the download links.
    items = get_url(html)
    # Print and download each entry.
    show_result(items)
|
||||
|
||||
46
code/miit.py
Normal file
46
code/miit.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# -*- coding:utf-8 -*-
|
||||
# Author: yiliyas
|
||||
# Date: 2022.5.18
|
||||
|
||||
from urllib import request
|
||||
import re
|
||||
|
||||
# MIIT industry data: fetch the landing page and print one Markdown table row
# per matched link, prefixed with the industry its URL belongs to.
url = 'https://www.miit.gov.cn/gxsj/index.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
req = request.Request(url, headers=headers)
# The context manager closes the HTTP response once the body has been read.
with request.urlopen(req) as response:
    # 'utf-8-sig' drops a leading byte-order mark if the page has one.
    content = response.read().decode('utf-8-sig')

# Each match is (href, title attribute, text of the trailing <span>).
pattern = re.compile('<p><a href="(.*?)".*?title="(.*?)">.*?</a><span>(.*?)</span></p>', re.S)
items = re.findall(pattern, content)

# URL fragment -> industry label, in the order the checks are applied.
# The checks are independent: a URL containing several fragments prints one
# row per matching fragment.
categories = (
    ('yclgy', '原材料工业'),
    ('zbgy', '装备工业'),
    ('xfpgy', '消费品工业'),
    ('txy', '通信业'),
    ('dzxx', '电子信息制造业'),
    ('rjy', '软件业'),
    ('hlw', '互联网'),
    ('wlaq', '网络安全'),
)

for item in items:
    url = item[0]
    uname = item[1]
    utime = item[2]
    # The hrefs are site-relative, so prepend the host.
    url = 'https://www.miit.gov.cn' + url
    for fragment, label in categories:
        if fragment in url:
            strs = '|' + label + '|[' + uname + '](' + url + ')|' + utime
            print(strs)
|
||||
80
code/tjgb.py
Normal file
80
code/tjgb.py
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# Author: yiliyas
|
||||
# Date: 2022.5.18
|
||||
|
||||
import re
|
||||
from urllib import request
|
||||
|
||||
# 1.获取数据
|
||||
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with request.urlopen(req) as response:
        # Adjust the charset here if the target page uses another encoding.
        return response.read().decode('utf-8')
|
||||
|
||||
# 2.处理数据(设置匹配条件)
|
||||
def get_url(content):
    """Extract the bulletin entries from a listing page.

    Args:
        content: HTML text of the listing page.

    Returns:
        A list of 3-tuples, one per matched ``<a>`` element: the href and
        the text of the two ``<font>`` elements inside the link.
    """
    # The pattern depends on the page layout; adjust it for other sites.
    entry_re = re.compile(
        '<a href="(.*?)".*?><span.*?><img.*?><font.*?>(.*?)</font><font.*?>(.*?)</font></span></a>',
        re.S,
    )
    return entry_re.findall(content)
|
||||
|
||||
# 3.显示数据
|
||||
def show_result(items):
    """Print and download every entry whose title contains '年'.

    The title with '年' removed is parsed as an integer year, which decides
    how the absolute link is built.

    Args:
        items: Iterable of match tuples as returned by ``get_url``;
            index 0 is the link's href and index 1 is its title.
    """
    for entry in items:
        href, title = entry[0], entry[1]
        # Only titles containing '年' are processed.
        if '年' not in str(title):
            continue
        year = int(str(title).replace('年', ''))
        # Build the full link, e.g.
        # http://www.stats.gov.cn/tjsj/zxfb/201602/t20160229_1323991.html
        if year > 2012:
            # The href only needs the host prepended.
            link = 'http://www.stats.gov.cn' + str(href)
        elif 1978 <= year <= 2012:
            # './' becomes '/' and the listing directory is prepended.
            link = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb' + str(href).replace('./', '/')
        else:
            # Years before 1978 keep the href exactly as matched.
            link = href
        # Show the entry as a Markdown link cell.
        print('|[' + title + '](' + link + ')')
        # Download the file behind the link.
        get_file(link)
|
||||
|
||||
# 4.下载数据(写入文件)
|
||||
def get_file(url, download_dir='E:/download/'):
    """Download *url* and save it under *download_dir*.

    The file is named after the last ``/``-separated segment of *url*.

    Args:
        url: Address of the file to download.
        download_dir: Directory prefix the file name is appended to. It
            must already exist and end with a path separator. Defaults to
            the location that was previously hard-coded.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        OSError: If the target file cannot be opened or written.
    """
    file_name = url.split('/')[-1]
    file_names = download_dir + file_name
    req = request.Request(url)
    block_sz = 8192
    # Both the connection and the output file are closed by the context
    # managers, even when a read or write fails part-way through.
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        # Copy in fixed-size chunks so large files are never held in memory.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Only the bare file name (not the full path) is reported.
    print ("Sucessful to download" + " " + file_name)
|
||||
|
||||
# Download (2013~2021)
# NOTE(review): the year ranges below are carried over from the original
# comments and do not obviously line up with each other; confirm which
# listing page covers which years.
# 2022-2021
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index.html'
# 1983-2001
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_1.html'
# Listing page currently selected; swap in one of the URLs above to process
# a different page.
url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_2.html'

# Fetch the listing page.
html = get_html(url)
# Extract the download links.
items = get_url(html)
# Print and download each entry.
show_result(items)
|
||||
|
||||
|
||||
69
code/tjnq.py
Normal file
69
code/tjnq.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# Author: yiliyas
|
||||
# Date: 2022.5.18
|
||||
|
||||
import re
|
||||
from urllib import request
|
||||
|
||||
# 1.获取数据
|
||||
def get_html(url):
    """Fetch *url* and return the response body decoded as GB2312.

    The request carries a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid GB2312.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with request.urlopen(req) as response:
        return response.read().decode('gb2312')
|
||||
|
||||
# 2.处理数据(设置匹配条件)
|
||||
def get_url(content):
    """Extract the link entries from a yearbook navigation page.

    Args:
        content: HTML text of the navigation page.

    Returns:
        A list of 2-tuples ``(href, link text)``, one per matched
        ``<li><a href='...'>...</a></li>`` element.
    """
    # The pattern depends on the page layout; adjust it for other sites.
    entry_re = re.compile(
        '<li><a href=\'(.*?)\'>(.*?)</a></li>',
        re.S,
    )
    return entry_re.findall(content)
|
||||
|
||||
# 3.显示数据
|
||||
def show_result(items):
    """Print and download every entry of the current year's yearbook.

    Reads the module-level ``iyear`` to build each absolute link.

    Args:
        items: Iterable of match tuples as returned by ``get_url``;
            index 0 is the link's href and index 1 is its title.
    """
    # Directory of the yearbook for the year currently being processed.
    year_base = 'http://www.stats.gov.cn/tjsj/ndsj/' + str(iyear) + '/'
    for entry in items:
        href, title = entry[0], entry[1]
        # Build the full link.
        link = year_base + str(href)
        # Show the entry as a Markdown link cell.
        print('|[' + title + '](' + link + ')')
        # Download the file behind the link.
        get_file(link)
|
||||
|
||||
# 4.下载数据(写入文件)
|
||||
def get_file(url, download_dir='E:/download/'):
    """Download *url* and save it under *download_dir*.

    The file is named after the last ``/``-separated segment of *url*,
    prefixed with the module-level ``iyear`` so that files with the same
    name from different years do not overwrite each other.

    Args:
        url: Address of the file to download.
        download_dir: Directory prefix the file name is appended to. It
            must already exist and end with a path separator. Defaults to
            the location that was previously hard-coded.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        OSError: If the target file cannot be opened or written.
    """
    file_name = url.split('/')[-1]
    file_names = download_dir + str(iyear) + file_name
    req = request.Request(url)
    block_sz = 8192
    # Both the connection and the output file are closed by the context
    # managers, even when a read or write fails part-way through.
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        # Copy in fixed-size chunks so large files are never held in memory.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print ("Sucessful to download" + " " + file_names)
|
||||
|
||||
# Download the yearbooks for 2005 through 2021.
# iyear is a module-level variable that show_result() and get_file() read to
# build links and file names, so the loop variable must keep this name.
# (A `global` statement is not needed at module level.)
#url = 'http://www.stats.gov.cn/tjsj/ndsj/2021/left.htm'
for iyear in range(2005, 2022):  # stop is exclusive, so 2022 includes 2021
    url = 'http://www.stats.gov.cn/tjsj/ndsj/'+str(iyear)+'/left.htm'
    # Fetch the navigation page for this year.
    html = get_html(url)
    # Extract the download links.
    items = get_url(html)
    # Print and download each entry.
    show_result(items)
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
## [中国政府网](http://www.gov.cn/shuju/index.htm) :+1::+1::+1:
|
||||
- [政府工作报告_历年(2014~2021)](http://www.gov.cn/guowuyuan/zfgzbg.htm)
|
||||
😄 [中国共产党一百年大事记(1921年7月-2021年6月)](../top/t100.md)
|
||||
😄 [商务部-商业特许经营备案查询](http://txjy.syggs.mofcom.gov.cn)
|
||||
😄 [中国执行信息公开网](http://zxgk.court.gov.cn/)
|
||||
- 站内搜索——[十四五通知](http://sousuo.gov.cn/s.htm?q=%E5%8D%81%E5%9B%9B%E4%BA%94%E9%80%9A%E7%9F%A5&n=10&p=1&t=govall&timetype=&mintime=&maxtime=&sort=&sortType=1&nocorrect=)
|
||||
|
||||
34
data/data.md
34
data/data.md
@@ -1,33 +1,33 @@
|
||||
# 官方权威统计数据(更新日期:2022-5-15)
|
||||
# 官方权威统计数据(更新日期:2022-5-18)
|
||||
|
||||
|数据名称|本地镜像|shell下载|python下载|说明|
|
||||
|:----|:----|:----|:----|:----|
|
||||
|[1.中国统计年鉴](http://www.stats.gov.cn/tjsj/ndsj/)|[镜像](./data-tjnq.md)|[shell](../code/tjnq.md)|[python](../code/test.py)|中文版:1999-2021年,22年官方权威数据 英文版:2004-2021年,17年官方权威数据
|
||||
|[2.全国年度统计公报](http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/) |[镜像](./data-tjgb.md) |[shell](../code/tjgb.md)|[python](../code/test.py)|1978-2020年,42年官方权威数据
|
||||
|[3.工信数据(工业和信息化部)](https://www.miit.gov.cn/gxsj/index.html) |[镜像](./data-miit.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|[4.中国互联网络发展状况统计报告](http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm)|[镜像](./data-internet.md)|[shell](../code/internet.md)|[python](../code/test.py)|1998-2022年,24年官方权威数据
|
||||
|[5.中国政府网](http://www.gov.cn/shuju/index.htm)|[镜像](./data-gov.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|[1.中国统计年鉴](http://www.stats.gov.cn/tjsj/ndsj/)|[镜像](./data-tjnq.md)|[shell](../code/tjnq.md)|[python](../code/tjnq.py)|中文版:1999-2021年,22年官方权威数据 英文版:2004-2021年,17年官方权威数据
|
||||
|[2.全国年度统计公报](http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/) |[镜像](./data-tjgb.md) |[shell](../code/tjgb.md)|[python](../code/tjgb.py)|1978-2020年,42年官方权威数据
|
||||
|[3.工信数据(工业和信息化部)](https://www.miit.gov.cn/gxsj/index.html) |[镜像](./data-miit.md)| |[python](../code/miit.py)|八大行业:原材料工业,装备工业,消费品工业,通信业,电子信息制造业,软件业,互联网,网络安全|
|
||||
|[4.中国互联网络发展状况统计报告](http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm)|[镜像](./data-internet.md)|[shell](../code/internet.md)|[python](../code/internet.py)|1998-2022年,24年官方权威数据
|
||||
|[5.中国政府网](http://www.gov.cn/shuju/index.htm)|[镜像](./data-gov.md)| | |GDP、CPI、PPI |
|
||||
|[6.中国信通院](http://www.caict.ac.cn/kxyj/) |[镜像](./data-caict-bps.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|😄[白皮书:271 个](http://www.caict.ac.cn/kxyj/qwfb/bps)|[镜像](./data-caict-bps.md)|[shell](./data-caict-bps.md)|[python](../code/test.py)||
|
||||
|😄[专题报告:196 个](http://www.caict.ac.cn/kxyj/qwfb)|[镜像](./data-caict-ztbg.md)|[shell](./data-caict-ztbg.md)|[python](../code/test.py)||
|
||||
|😄[权威数据:487 个](http://www.caict.ac.cn/kxyj/qwfb/qwsj/)|[镜像](./data-caict-qwsj.md)||[shell](./data-caict-qwsj.md)|[python](../code/test.py)||
|
||||
|[7.财富FORTUNE 500强](https://www.fortunechina.com/rankings/node_11663.htm)|[镜像](../top/t500.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|8.上海数据|[镜像](./data-sh.md)|-|-|统计公报,统计年鉴,社会民生数据|
|
||||
|😄[白皮书:271 个](http://www.caict.ac.cn/kxyj/qwfb/bps)|[镜像](./data-caict-bps.md)| |[python](../code/test.py)|权威发布:ICT行业白皮书|
|
||||
|😄[专题报告:196 个](http://www.caict.ac.cn/kxyj/qwfb)|[镜像](./data-caict-ztbg.md)| |[python](../code/test.py)|权威发布:ICT行业专题报告|
|
||||
|😄[权威数据:487 个](http://www.caict.ac.cn/kxyj/qwfb/qwsj/)|[镜像](./data-caict-qwsj.md)| |[python](../code/test.py)|权威发布:ICT行业权威数据|
|
||||
|[7.财富FORTUNE 500强](https://www.fortunechina.com/rankings/node_11663.htm)|[镜像](../top/t500.md)|[shell](../code/test.md)|[python](../code/test.py)|世界500强,中国500强|
|
||||
|8.上海数据|[镜像](./data-sh.md)| | |统计公报,统计年鉴,社会民生数据|
|
||||
|[上海统计公报](http://tjj.sh.gov.cn/tjgb/index.html)|[镜像](./data-sh.md)|[shell](../code/test.md)|[python](../code/test.py)|2002-2021年,20年官方权威数据|
|
||||
|[上海统计年鉴](http://tjj.sh.gov.cn/tjnj/index.html)|[镜像](./data-sh.md)|[shell](../code/test.md)|[python](../code/test.py)|2004-2021年,18年官方权威数据|
|
||||
|[上海市卫生健康委员会](https://wsjkw.sh.gov.cn/xwfb/index.html)|-|-|[python](../code/test.py)|疫情数据,权威发布|
|
||||
|[上海市商务委员会-绿卡通行证企业大全](https://sww.sh.gov.cn/swdt/index.html)|[镜像](./data-sh-yq.md)|-|[python](../code/test.py)|疫情保障,商务动态|
|
||||
|[9.艾瑞咨询——产业研究报告](https://www.iresearch.com.cn/report.shtml)|-|-|-|新经济与产业数字化,专业咨询服务领导品牌|
|
||||
|[上海市卫生健康委员会](https://wsjkw.sh.gov.cn/xwfb/index.html)| | | |疫情数据发布|
|
||||
|[上海市商务委员会-绿卡通行证企业大全](https://sww.sh.gov.cn/swdt/index.html)|[镜像](./data-sh-yq.md)| |[python](../code/test.py)|疫情保障,商务动态|
|
||||
|[9.艾瑞咨询——产业研究报告](https://www.iresearch.com.cn/report.shtml)| | | |新经济与产业数字化,专业咨询服务领导品牌|
|
||||
|😄最新报告|[镜像](./data-report.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|😄行业分类报告-24个分类|[镜像](./data-report-type.md)|[shell](../code/test.md)|[python](../code/test.py)||
|
||||
|😄行业年度报告-(2012年——2022年)|[镜像](./data-report-year.md)| [shell](../code/test.md)|[python](../code/test.py)||
|
||||
|10. AI公开数据集|-|-|-|(^_^最好的在最后^_^)|
|
||||
|[最全中文NLP资源库](https://github.com/fighting41love/funNLP)|😄[镜像](../ai/funnlp.md)|-|-|五星推荐,用了都说好!!!|
|
||||
|10. AI公开数据集| | | |(^_^最好的在最后^_^)|
|
||||
|[最全中文NLP资源库](https://github.com/fighting41love/funNLP)|😄[镜像](../ai/funnlp.md)| | |五星推荐,用了都说好!!!|
|
||||
|开放数据集(Open Dataset)|[镜像](../ai/funnlp.md)|[shell](../code/test.md)|[python](../code/test.py)|建设中|
|
||||
|ml机器学习数据集|[镜像](../ai/funnlp.md)|[shell](../code/test.md)|[python](../code/test.py)|建设中|
|
||||
|dl深度学习数据集|[镜像](../ai/funnlp.md)|[shell](../code/test.md)|[python](../code/test.py)|建设中|
|
||||
|大学开放数据集|[镜像](../ai/funnlp.md)|[shell](../code/test.md)|[python](../code/test.py)|建设中|
|
||||
|[UCI 加州大学欧文分校机器学习存储库](http://archive.ics.uci.edu/ml/datasets.php)|[镜像](../ai/uci.md)|-|[python](../code/test.py)|Machine Learning Repository DataSets
|
||||
|[UCI 加州大学欧文分校机器学习存储库](http://archive.ics.uci.edu/ml/datasets.php)|[镜像](../ai/uci.md)| |[python](../code/test.py)|Machine Learning Repository DataSets
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user