Python3.7 抓取中国职教百科高校录取专业备案信息

Python3.7 抓取中国职教百科高校录取专业备案信息

author: he xiaodong date: 2019-04-17

可以参考 http://www.zjchina.org/mspMajorIndexAction.fo?year=2019&tdsourcetag=s_pcqq_aiomsg 这个地址,查看筛选表单和结果页面。筛选表单会 POST 到本页,返回的信息是 DOM 中 table 里的数据;总数量显示在页码部分,因此只有拿到第一页结果之后,才能计算出总页数。第二页和第一页的区别主要在于 startcount 这个参数,例如第二页是 startcount=100。第一次抓取得到总页数之后,如果有多页,则循环抓取其余各页;如果只有一页,则继续处理下一个专业。这个网站没有做反爬虫处理,所以代码里也没有加入切换请求头(header)信息的部分。

数据库和代码:
majors.sql

-- Table `majors`: one row per (major, school) record scraped from the result
-- table of the site. Column meanings below are translations of the COMMENT
-- strings; the COMMENT literals themselves are left as-is.
-- NOTE(review): AUTO_INCREMENT=237 looks like a leftover from dumping an
-- already populated table - confirm whether a fresh install should keep it.
CREATE TABLE `majors` (
  -- Surrogate primary key.
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Search keyword that was used for the query.
  `keyword` varchar(50) DEFAULT NULL COMMENT '关键字',
  -- Province information.
  `province` varchar(50) DEFAULT NULL COMMENT '省份信息',
  -- Major code.
  `code` varchar(50) DEFAULT NULL COMMENT '专业代码',
  -- Major name.
  `name` varchar(50) DEFAULT NULL COMMENT '专业名称',
  -- School code.
  `school_code` varchar(50) DEFAULT NULL COMMENT '学校代码',
  -- School name.
  `school_name` varchar(255) DEFAULT NULL COMMENT '学校名称',
  -- Number of years (presumably the length of the programme - confirm).
  `years_limit` tinyint(3) unsigned DEFAULT NULL COMMENT '年限',
  -- Free-form remark.
  `remark` varchar(255) DEFAULT NULL COMMENT '备注',
  -- Set on insert; because of ON UPDATE CURRENT_TIMESTAMP it is also
  -- refreshed whenever the row is updated.
  -- NOTE(review): for a column named created_at that is probably unintended.
  `created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=237 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;

majors.py

# -*- coding: utf8 -*-
from bs4 import BeautifulSoup
import requests
import csv
import bs4
import pymysql
import math

def get_data(majorname, year=2019, timeout=30):
    """Scrape every result page for one major keyword and store the rows.

    The first page is requested and handed to ``create``. The total row
    count is then read from the first ``<font>`` inside ``div#page`` and,
    when it spans more than one page, the remaining pages are fetched and
    stored the same way.

    Args:
        majorname: Major name used as the ``majorname`` filter; also passed
            to ``create`` as the keyword.
        year: Value sent in the ``year`` form field.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On network failure, timeout or a
            non-success HTTP status.
    """
    print(majorname)
    page_size = 100

    body = _fetch_result_body(majorname, year, 0, 1, page_size, timeout)
    if body is None:
        print('empty response for major: {}'.format(majorname))
        return
    create(majorname, body)

    # The total row count is only available in the pager of the first page.
    pager = body.find('div', {'id': 'page'})
    counters = pager.find_all('font') if pager is not None else []
    try:
        total_rows = float(counters[0].text)
    except (IndexError, ValueError):
        # No pager or a non-numeric count: treat the result as single-page
        # instead of crashing after the first page was already stored.
        print('cannot read total row count for major: {}'.format(majorname))
        return
    max_page = math.ceil(total_rows / page_size)
    print(max_page)

    # range() is empty when there is only one page, so no extra guard needed.
    for page_no in range(2, max_page + 1):
        offset = (page_no - 1) * page_size
        body = _fetch_result_body(
            majorname, year, offset, max_page, page_size, timeout)
        if body is None:
            print('empty response for major: {} (page {})'.format(
                majorname, page_no))
            continue
        create(majorname, body)


def _fetch_result_body(majorname, year, offset, total_pages, page_size,
                       timeout):
    """POST the filter form and return the parsed ``<body>`` (or None)."""
    url = 'http://www.zjchina.org/mspMajorIndexAction.fo'
    if offset:
        # Follow-up pages are selected through the query string; the form
        # field ``startcount`` itself stays 0, exactly as before.
        url += '?&startcount=' + str(offset)
    form = {
        'year': year,
        'province': '',
        'schoolno': '',
        'schoolname': '',
        'majorno': '',
        'majorname': majorname,
        'startcount': 0,
        'gopage': '',
        'totalPages': total_pages,
        'page_size': page_size,
    }
    # Without a timeout requests waits forever on a stalled server.
    response = requests.post(url, data=form, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser').body

    
def create(keyword, body):
    """Insert the rows of the result table found in *body* into ``majors``.

    The table is looked up inside ``div#A5``. Its first ``<tr>`` is skipped
    (presumably the header row) and the first seven ``<td>`` cells of every
    other row are stored as province, code, name, school_code, school_name,
    years_limit and remark. Uses the module-level ``cur`` and ``connect``.

    Args:
        keyword: Search keyword stored alongside every inserted row.
        body: Parsed ``<body>`` element of a result page (may be None).
    """
    container = body.find('div', {'id': 'A5'}) if body is not None else None
    table = container.find('table') if container is not None else None
    if table is None:
        print('no result table found for keyword: {}'.format(keyword))
        return

    # Placeholders let the driver escape the scraped text; building the
    # statement with str.format broke on any quote and allowed SQL injection.
    sql = (
        "insert into majors"
        "(keyword,province,code,name,school_code,school_name,"
        "years_limit,remark) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 7:
            # Not a data row (e.g. a spanning "no results" row); skip it
            # rather than abort the whole page with an IndexError.
            continue
        values = [keyword] + [cell.text for cell in cells[:7]]
        try:
            cur.execute(sql, values)
            connect.commit()
        except pymysql.MySQLError as e:
            # Best effort: report the failing row and keep going.
            print(e)

if __name__ == '__main__':
    # Script entry point: open the MySQL connection, scrape every configured
    # major and always close the connection afterwards.
    # ``connect`` and ``cur`` are module-level names because ``create`` reads
    # them as globals.
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment or a config file.
    connect = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="123456",
        db="business",
        port=3306,
        # Match the table's utf8mb4 charset; MySQL's "utf8" is limited to
        # 3-byte characters.
        charset="utf8mb4",
    )
    try:
        cur = connect.cursor()
        # Major names to scrape.
        majors = ['人物形象设计']
        for major in majors:
            get_data(major)
    finally:
        # Previously close() sat outside the __main__ guard, so importing the
        # module raised NameError and an exception during scraping leaked
        # the connection.
        connect.close()

效果: 爬取结果效果图

© 原创文章