重啟北護課程查詢系統 Day 1 - 頁面爬蟲

Posted on Thu, Feb 8, 2024 Node.JS Web JS

在實作第三方的北護課程查詢系統之前,我們必須先取得課程資料,這篇將記錄如何爬取北護的課程查詢系統

檢查頁面 (動態頁面 or 資料 API)

👉🏻

懶人包: 動態頁面

因為是動態頁面,就只能用 Debug 瀏覽器的方式獲取資料

課程資料

隱藏資料

🚨

因為 Google Chrome 會截斷程式碼,這邊請使用 Firefox

👉🏻

這段程式碼我已放在 gist 上: https://gist.github.com/Chinlinlee/445ed09289845c8abb26c45214612a92

抓取資料

以下我使用 Node.js,並以第三方套件管理器 pnpm 進行實作

新建專案

pnpm init
{
	...
  "type": "module",
	...
}

安裝套件

pnpm add puppeteer cheerio

程式碼

👉🏻

建議至 Github 觀看:

index.js

full code
import fsP from "fs/promises";
import puppeteer from "puppeteer";
import { sanitizeTime } from "./utils.js";
// Launch a browser with a visible window (headless: false) and open the single
// tab that the crawler functions in this file operate on through `page`.
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();

// Weekday label -> weekday number (Monday = 1 ... Sunday = 7), built from the
// labels listed in week order and frozen so it cannot be modified afterwards.
const dayNumMapping = Object.freeze(
    Object.fromEntries(
        ["週一", "週二", "週三", "週四", "週五", "週六", "週日"].map(
            (label, index) => [label, index + 1]
        )
    )
);

/**
 * Navigate the shared `page` to the NTUNHS course query page.
 * @returns {Promise<void>}
 */
async function gotoQueryPage() {
    const queryPageUrl = "https://system10.ntunhs.edu.tw/AcadInfoSystem/Modules/QueryCourse/QueryCourse.aspx";
    await page.goto(queryPageUrl);
}

/**
 * Pick the most recent semester in the semester drop-down.
 * @returns {Promise<void>}
 */
async function selectLastSemester() {
    // The page may still be loading, so wait (at most 5 seconds) until the
    // drop-down exists and is visible.
    const dropdownSelector = "#ContentPlaceHolder1_ddlSem";
    const waitOptions = { timeout: 5000, visible: true };
    const dropdown = await page.waitForSelector(dropdownSelector, waitOptions);

    // The first <option> is the "please choose a semester" placeholder, so the
    // newest semester is the second child.
    const newestOption = await dropdown.$("option:nth-child(2)");
    const newestValue = await newestOption.evaluate((option) => option.value);
    await dropdown.select(newestValue);
}

/**
 * Type the fixed course name "國文" into the course-name search box.
 * @returns {Promise<void>}
 */
async function typeCourseName() {
    const nameBox = await page.waitForSelector("#ContentPlaceHolder1_txtCourseName");
    const keyword = "國文";
    await nameBox.type(keyword);
}

/**
 * Wait for the query button to appear and click it.
 * @returns {Promise<void>}
 */
async function clickSearchButton() {
    const searchButtonSelector = "#ContentPlaceHolder1_btnQuery";
    const searchButton = await page.waitForSelector(searchButtonSelector);
    await searchButton.click();
}

/**
 * Scrape every course shown in the result table of the query page.
 *
 * Waits up to 60 seconds for the result table, collects the distinct `group`
 * attribute values of its rows, and reads one course from the first row of each
 * group. A group whose row is missing or cannot be parsed is logged and skipped
 * so that one bad row does not abort the whole crawl.
 *
 * @returns {Promise<import("./type").Course[]>}
 */
async function getCoursesContent() {
    const courseTableBlock = await page.waitForSelector("#ContentPlaceHolder1_NewGridView", {
        timeout: 60 * 1000
    });

    /** @type { import("./type").Course[] } */
    const courses = [];

    /** @type { import("puppeteer").ElementHandle<HTMLTableRowElement>[] } */
    const trs = await courseTableBlock.$$("tr");

    const groups = await getGroupsInCoursesTable(trs);

    // NOTE(review): iteration starts at index 1, so the first entry returned by
    // getGroupsInCoursesTable is never scraped. Kept exactly as before — confirm
    // whether that entry is a header/placeholder or a real course being skipped.
    for (let i = 1; i < groups.length; i++) {
        /** @type { import("puppeteer").ElementHandle<HTMLTableRowElement> | null } */
        const tr = await courseTableBlock.$(`tr[group='${groups[i]}']`);

        // Without this guard the error handler below would itself throw on a
        // null row and reject the whole crawl.
        if (!tr) {
            console.error(new Error(`No table row found for group ${groups[i]}`));
            continue;
        }

        try {
            courses.push(await parseCourseRow(tr));
        } catch (e) {
            // Dump the offending row so the failing selector can be diagnosed.
            console.error(e);
            console.log(await (await tr.getProperty("outerHTML")).jsonValue());
        }
    }

    return courses;
}

/**
 * Read all fields of a single course from its table row.
 * Property order is kept stable because it determines the key order of the
 * serialized JSON output.
 *
 * @param {import("puppeteer").ElementHandle<HTMLTableRowElement>} tr
 * @returns {Promise<import("./type").Course>}
 */
async function parseCourseRow(tr) {
    // Trimmed text content / raw title attribute of the first element matching a selector.
    const text = (selector) => tr.$eval(selector, el => el.textContent.trim());
    const title = (selector) => tr.$eval(selector, el => el.title);

    /** @type { import("./type").Course } */
    const course = {
        semester: await text(`span[id*="lblSEMNo"]`),
        department: await text(`span[id*="lblGroupName"]`),
        courseType: await text(`span[id*="lblCourseTypeName"]`),
        courseFullID: await text(`span[id*="hidCOURSEFULLNO"]`),
        courseName: await text(`span[id*="lblCourseName"]`),
        courseEngName: await text(`span[id*="hidECOURSENAME"]`),
        departmentID: await text(`span[id*="hidGROUPNO"]`),
        subjectID: await text(`span[id*="lblCourseNo"]`),
        subjectGroup: await text(`span[id*="hidCOURSEGROUP"]`),
        grade: await text(`span[id*="lblGrade"]`),
        classGroup: await text(`span[id*="lblClass"]`),
        credit: await text(`span[id*="lblCredit"]`),
        className: await text(`span[id*="hidCLASSNAME"]`),
        classID: await text(`span[id*="hidCLASSNO"]`),
        totalOfTakingStudents: await text(`span[id*="hidTOTALFULLCNT"]`),
        numberOfTakingStudents: await text(`span[id*="lblTotalCNT"]`),
        weekNumber: await text(`span[id*="hidWEEKDESC"]`),
        multipleTeacherName: await readMultipleTeacherNames(tr),
        note: await title(`span[id*="lblRemark"]`),
        coursePlanRelativeUrl: await text(`span[id*="hidUploadFile"]`),
        courseAbstract: await text(`span[id*="hidABSTRACT"]`),
        courseEngAbstract: await text(`span[id*="hidEABSTRACT"]`),

        day: await title(`span[id*="lblWeekNo"]`)
    };

    const mainTeacherNameEl = await tr.$(`div[id*="panMainTeachNameLinks"]`);
    course.mainTeacherName = await mainTeacherNameEl.$eval(`span`, el => el.textContent.trim());

    // A property lookup never throws, so an unknown or empty weekday label has
    // to be mapped to 0 explicitly.
    course.dayNum = dayNumMapping[course.day] ?? 0;

    // Prefer the room number shown in the cell; fall back to its title when the cell is empty.
    const courseLocationID = await text(`span[id*="lblRoomNo"]`);
    const courseLocationName = (await title(`span[id*="lblRoomNo"]`)).trim();
    course.courseLocation = courseLocationID || courseLocationName;

    const { startPeriod, endPeriod } = parsePeriodRange(await text(`span[id*="lblSecNo"]`));
    course.startPeriod = startPeriod;
    course.endPeriod = endPeriod;

    const time = (await title(`span[id*="lblSecNo"]`)).trim();
    const sanitizedTime = sanitizeTime(time);
    course.startTime = sanitizedTime.startTime;
    course.endTime = sanitizedTime.endTime;

    return course;
}

/**
 * Join the text of every <span> inside the multiple-teachers panel with ", ".
 *
 * @param {import("puppeteer").ElementHandle<HTMLTableRowElement>} tr
 * @returns {Promise<string>}
 */
async function readMultipleTeacherNames(tr) {
    const panel = await tr.$(`div[id*="panMultipleTeachNameLinks"]`);
    const names = await panel.$$eval(`span`, els => els.map(el => el.textContent));
    return names.join(", ");
}

/**
 * Split a period label such as "5~7" or "3" into its first and last period.
 * A single period is both the start and the end; a label without digits
 * yields two empty strings.
 *
 * @param {string} period
 * @returns {{ startPeriod: string, endPeriod: string }}
 */
function parsePeriodRange(period) {
    const match = period.match(/(\d+)(?:~(\d+))?/);
    if (!match) {
        return { startPeriod: "", endPeriod: "" };
    }
    // The second capture group is undefined when there is no "~N" part.
    const [, startPeriod, endPeriod = startPeriod] = match;
    return { startPeriod, endPeriod };
}

/**
 * Collect the distinct `group` attribute values found on the given table rows.
 *
 * Per the original author's note, the school site splits tall table entries
 * across several rows and tags the rows of one course with the same `group`
 * value (an incrementing number starting at 1), so the value identifies a course.
 * Rows without a `group` attribute are ignored.
 *
 * @param {import("puppeteer").ElementHandle<HTMLTableRowElement>[]} trs
 * @returns {Promise<string[]>} the distinct group values, as object keys
 */
async function getGroupsInCoursesTable(trs) {
    // Object keys are used as the "set" so the resulting key order is unchanged.
    const seen = {};
    for (let index = 0; index < trs.length; index++) {
        const groupAttr = await trs[index].evaluate((row) => row.getAttribute("group"));
        if (!groupAttr) {
            continue;
        }
        seen[groupAttr] = 1;
    }
    return Object.keys(seen);
}

/**
 * Serialize the scraped courses as 4-space-indented JSON into `courses.json`
 * (path relative to the current working directory).
 *
 * @param {import("./type").Course[]} courses
 * @returns {Promise<void>}
 */
async function writeCoursesToFile(courses) {
    const outputPath = "courses.json";
    const indentWidth = 4;
    await fsP.writeFile(outputPath, JSON.stringify(courses, null, indentWidth));
}

/**
 * Run the whole crawl: open the query page, pick the newest semester, run the
 * search, scrape the result table and write it to disk.
 *
 * The browser is always closed, even when a step fails. On success the process
 * is terminated with exit code 0; on failure the error is propagated to the
 * caller and the process is not exited here.
 *
 * @returns {Promise<void>}
 */
async function doCraw() {
    try {
        await gotoQueryPage();
        await selectLastSemester();
        // Quick-test switch: uncomment to search only for the "國文" course.
        // await typeCourseName();
        await clickSearchButton();
        const courses = await getCoursesContent();
        await writeCoursesToFile(courses);
        console.log("done");
    } finally {
        // Release the browser process regardless of the outcome.
        await browser.close();
    }
    // A non-zero code would report a successful crawl as a failure.
    process.exit(0);
}

// Start the crawl as soon as this module is executed.
doCraw();

utils.js

full code
/**
 * Extract the start and end clock times from a period-time label.
 *
 * Han characters are stripped first; the start time is then the text between
 * the first "(" and the first "~", and the end time is the text between the
 * last "~" and the last ")".
 * NOTE(review): assumes labels shaped like "第5節(12:40~13:30)~第7節(14:40~15:30)"
 * — confirm against the real `title` attribute on the site.
 *
 * @param {string} time raw label text
 * @returns {{ startTime: string, endTime: string }} both empty when the label
 *   contains no "~"
 */
function sanitizeTime(time) {
    // The `u` flag is required for the \p{...} Unicode property escape.
    const withoutHan = time.replace(/\p{Script=Han}/gu, "");

    const firstTilde = withoutHan.indexOf("~");
    if (firstTilde === -1) {
        // No range separator at all (e.g. an empty label): nothing to extract.
        return { startTime: "", endTime: "" };
    }

    // indexOf returns -1 when there is no "(", which makes the slice start at 0.
    const firstLeftBracket = withoutHan.indexOf("(");
    const startTime = withoutHan.substring(firstLeftBracket + 1, firstTilde);

    const lastTilde = withoutHan.lastIndexOf("~");
    const lastRightBracket = withoutHan.lastIndexOf(")");
    // Without a closing bracket, read up to the end of the label.
    const endIndex = lastRightBracket === -1 ? withoutHan.length : lastRightBracket;
    const endTime = withoutHan.substring(lastTilde + 1, endIndex);

    return {
        startTime,
        endTime
    };
}

export { sanitizeTime };

輸出結果

[
    {
        "semester": "1122",
        "department": "二年制進修部護理系(日間班)",
        "courseType": "通識選修(通識)",
        "courseFullID": "11230028801370",
        "courseName": "休閒與生活",
        "courseEngName": "Leisure and Life",
        "departmentID": "11230",
        "subjectID": "0288",
        "subjectGroup": "01",
        "grade": "3",
        "classGroup": "70",
        "credit": "2",
        "className": "護進日二技3年70班",
        "classID": "11230370",
        "totalOfTakingStudents": "",
        "numberOfTakingStudents": "50",
        "weekNumber": "第1~14週",
        "multipleTeacherName": "吳旻穎",
        "note": "1.畢業班14週課程。\n修課限制:本系二技進修部、本系二技一般生、外系二技進修部、本系四技一般生、外系四技一般生、外系二技一般生、畢業班課程。\n本課程安排多次校外教學和戶外活動,欲選修前,請審慎評估自身體力、健康狀況及耐候程度,以確保個人安全。",
        "coursePlanRelativeUrl": "",
        "courseAbstract": "本課程旨在從學生經由電影休閒活動,引導學生了解更深層生活之中的文化領域,再利用文學閱讀的休閒活動,更進一步進行個人的寫作創作,開發未獲運用的身心組織及功能,更積極地反饋為個人的成長,增進個人身心發展與成熟,進而轉換成品德與自我價值體系。",
        "courseEngAbstract": "The aim of this course is to guide students explore their culture by watching movies, reading books, and writing novel. To borrow the ideas of the movies, students can  create their own novel. In term of those leisure activities, students can develop their personal growth and create their value system.",
        "day": "週一",
        "mainTeacherName": "吳旻穎",
        "dayNum": 1,
        "courseLocation": "F410",
        "startPeriod": "5",
        "endPeriod": "7",
        "startTime": "12:40",
        "endTime": "15:30"
    }
]