init: 添加 GLUT 学分统计爬虫项目

- 新增 README.md 文件,包含项目描述、初始化步骤和使用方法
- 添加 default.env 文件,用于配置环境变量
- 实现 glut.py,包含登录、获取成绩、解析成绩等功能
- 添加 index.html,提供 Web 界面展示成绩
- 实现 main.py,提供命令行接口
- 添加 requirements.txt,列出项目依赖
- 实现 server.py,提供 HTTP 服务接口
This commit is contained in:
ZZY 2024-12-12 22:13:16 +08:00
commit 34770cfeff
7 changed files with 496 additions and 0 deletions

40
README.md Normal file
View File

@ -0,0 +1,40 @@
# GLUT spider web
- 项目描述
- 用于从 GLUT 教务系统获取成绩并计算学分加权平均分,同时作为一个爬虫示例
- 初始化项目
```shell
# Windows
python -m venv .venv
.venv\Scripts\activate
pip install -r requirements.txt
# Linux
python3 -m venv .venv
source .venv/bin/activate
pip3 install -r requirements.txt
```
- 配置环境变量( **推荐配置** )
`default.env`内容填写完整后改名为`.env`
- 使用方法
```shell
# 帮助文档
python main.py -h
# 开启服务器
python main.py -s
# 使用命令行操作,如果没有配置环境变量则需要给出用户名和密码
# 注意默认会输出scores.json
> python main.py -t 2023_sp
Fetching scores for 2023 sp...
aggregate_score: 72.32824427480917
> python main.py -t 2023_sp -u 32220520xxxxx -p xxxxx
Fetching scores for 2023 sp...
aggregate_score: 72.32824427480917
```

3
default.env Normal file
View File

@ -0,0 +1,3 @@
USERNAME=3220520xxxxx
PASSWORD=YourPassword
PORT=8000

179
glut.py Normal file
View File

@ -0,0 +1,179 @@
from dataclasses import dataclass, asdict
from enum import Enum, auto
import requests
from lxml import etree
class Term(Enum):
    """Academic term selector used when querying scores.

    ``repr()`` of a member is its lower-case name (for example ``spring``),
    which is the form used when a score record is converted to a dict.
    """

    SPRING = 1
    AUTUMN = 2
    ALL = 3

    def __repr__(self):
        # Lower-case member name instead of the default ``<Term.X: n>`` form.
        lowered = self.name.lower()
        return lowered
class GLUTAcademic:
    """Client for the GLUT academic-affairs site (``jw.glut.edu.cn``).

    Logs in with a student account, downloads the score page, parses it into
    :class:`GLUTAcademic.Score` records and computes a credit-weighted
    average of the final grades.
    """

    class LoginFailedError(Exception):
        """Raised by :meth:`login` when the site does not accept the login."""

    # Numeric equivalents used for non-numeric (level) final grades.
    # NOTE(review): the keys for 85 and 75 were unreadable when this table was
    # restored, so both the two-character and the one-character spellings are
    # accepted here -- confirm against what the site actually prints.
    # An empty grade cell counts as 0.
    FINAL_GRADE_DICT = {
        '优秀': 95,
        '良好': 85,
        '良': 85,
        '中等': 75,
        '中': 75,
        '及格': 65,
        '不及格': 40,
        '': 0,
    }

    # Term labels as they appear in the score table.
    _TERM_BY_LABEL = {
        '春': Term.SPRING,
        '秋': Term.AUTUMN,
    }

    # Number of columns read from each score-table row.
    _COLUMN_COUNT = 21

    @dataclass
    class Score:
        """One row of the score table."""

        year: int
        term: Term
        department: str
        course_number: int
        course_name: str
        course_sequence: int
        instructor: str
        final_grade: float  # numeric grade, or the mapped value of a level grade
        gpa: float
        credits: float
        hours: float
        assessment_method: str
        course_attribute: str
        notes: str
        exam_type: str
        retake_flag: str
        course_requirements: str
        course_category: str
        coefficient: str
        second_degree_minor: str
        pass_flag: str

        def as_dict(self):
            """Return the record as a JSON-serializable dict.

            The ``term`` entry is the lower-case member name (``spring`` /
            ``autumn`` / ``all``) rather than the enum object.
            """
            d = asdict(self)
            d['term'] = self.term.name.lower()
            return d

        def __str__(self):
            return (f"Score(学年={self.year}, 学期={self.term}, 开课院系={self.department}, "
                    f"课程号={self.course_number}, 课程名={self.course_name}, 课序号={self.course_sequence}, "
                    f"主讲教师={self.instructor}, 总评={self.final_grade}, 绩点={self.gpa}, "
                    f"学分={self.credits}, 学时={self.hours}, 考核方式={self.assessment_method}, "
                    f"选课属性={self.course_attribute}, 备注={self.notes}, 考试性质={self.exam_type}, "
                    f"是否缓考={self.retake_flag}, 课程要求={self.course_requirements}, "
                    f"课程类别={self.course_category}, 系数={self.coefficient}, "
                    f"二学位辅修={self.second_degree_minor}, 及格标志={self.pass_flag})")

    def __init__(self, username: "int | str", password: str, timeout: float = 10.0):
        """Create a client that is not logged in yet.

        Args:
            username: Student account number (only ever formatted into the
                login request, so an int or a str both work).
            password: Account password.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.session = requests.Session()
        self.username = username
        self.password = password
        self.timeout = timeout
        self.login_status = False
        self.base_url = 'https://jw.glut.edu.cn'

    def login(self):
        """Log in on the shared session.

        Raises:
            LoginFailedError: if the response is not a 200 or carries
                ``X-Frame-Options: DENY`` (treated as the rejection marker).
        """
        url = f"{self.base_url}/academic/j_acegi_security_check"
        # Passed as ``params`` so requests URL-encodes the values; a password
        # containing '&', '#', '%' or '+' would otherwise corrupt the query.
        params = {
            'j_username': self.username,
            'j_password': self.password,
            'j_captcha': 'undefined',
        }
        res = self.session.get(url, params=params, timeout=self.timeout)
        if res.status_code == 200 and res.headers.get('X-Frame-Options') != 'DENY':
            self.login_status = True
        else:
            self.login_status = False
            raise self.LoginFailedError('login error maybe username or password is wrong')

    def get_cookies(self):
        """Return the session cookies as a ``k=v; k=v`` string, or None if not logged in."""
        if not self.login_status:
            return None
        return '; '.join([f'{key}={value}' for key, value in self.session.cookies.items()])

    def get_scores_raw(self, year: int = 2024, term: Term = Term.ALL):
        """Download the raw score page for one year/term.

        Returns:
            The response body as bytes.

        Raises:
            RuntimeError: if the response is not a 200 or carries
                ``X-Frame-Options: DENY``.
        """
        url = f"{self.base_url}/academic/manager/score/studentOwnScore.do"
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        term_dict = {
            Term.SPRING: "1",
            Term.AUTUMN: "2",
            Term.ALL: ""
        }
        data = {
            # The form takes a year id rather than the calendar year;
            # presumably 2013 corresponds to id 33 -- confirm against the site.
            'year': year - 2013 + 33,
            'term': term_dict[term],
            'prop': '',
            'groupName': '',
            'para': '0',
            'sortColumn': '',
            'Submit': '查询'
        }
        res = self.session.post(url, data=data, headers=headers, timeout=self.timeout)
        if res.status_code != 200 or res.headers.get('X-Frame-Options') == 'DENY':
            raise RuntimeError("Failed to get scores")
        return res.content

    def parse_scores(self, raw_score: bytes) -> list[Score]:
        """Parse the score page into :class:`Score` records.

        Raises:
            RuntimeError: if the page does not contain exactly one
                ``table.datalist``; the raw page is saved to ``error.html``
                for inspection first.
            ValueError: if a row cannot be converted (see :meth:`parse_scope`).
        """
        tree = etree.fromstring(raw_score, parser=etree.HTMLParser(encoding='utf-8'))
        table = tree.xpath('//table[@class="datalist"]')
        if len(table) != 1:
            err_msg = f"Failed to parse scores {table}"
            with open('error.html', 'wb') as f:
                f.write(raw_score)
            raise RuntimeError(err_msg)
        rows = table[0].xpath('tr')[1:]  # first row is the header
        # Read one string per <td>: selecting 'td/text()' directly would skip
        # empty cells and shift every following column index.
        return [
            self.parse_scope([''.join(td.xpath('text()')) for td in tr.xpath('td')])
            for tr in rows
        ]

    def parse_scope(self, texts: list[str]) -> Score:
        """Convert the cell texts of one table row into a :class:`Score`.

        Args:
            texts: Cell texts in column order; surrounding whitespace is
                stripped here.

        Raises:
            ValueError: if the row has too few columns, the term label is
                unknown, or a numeric column cannot be converted.
        """
        texts = [t.strip() for t in texts]
        if len(texts) < self._COLUMN_COUNT:
            raise ValueError(
                f'expected at least {self._COLUMN_COUNT} columns in a score row '
                f'but got {len(texts)}')

        grade_text = texts[7]
        if grade_text in self.FINAL_GRADE_DICT:
            final_grade = self.FINAL_GRADE_DICT[grade_text]
        else:
            try:
                final_grade = float(grade_text)
            except ValueError as exc:
                raise ValueError(f'Final grade is not a number: {grade_text!r}') from exc

        term = self._TERM_BY_LABEL.get(texts[1])
        if term is None:
            raise ValueError(f'term error maybe 春 or 秋 but got {texts[1]}')

        return self.Score(
            year=int(texts[0]),
            term=term,
            department=texts[2],
            course_number=int(texts[3]),
            course_name=texts[4],
            course_sequence=int(texts[5]),
            instructor=texts[6],
            final_grade=final_grade,
            gpa=float(texts[8]),
            credits=float(texts[9]),
            hours=float(texts[10]),
            assessment_method=texts[11],
            course_attribute=texts[12],
            notes=texts[13],
            exam_type=texts[14],
            retake_flag=texts[15],
            course_requirements=texts[16],
            course_category=texts[17],
            coefficient=texts[18],
            second_degree_minor=texts[19],
            pass_flag=texts[20],
        )

    def get_scores(self, year: int = 2024, term: Term = Term.ALL) -> list[Score]:
        """Download and parse the scores for one year/term."""
        raw_score = self.get_scores_raw(year, term)
        return self.parse_scores(raw_score)

    def calculate_scores(self, scores: list[Score]) -> float:
        """Return the credit-weighted average final grade.

        Only courses whose ``course_attribute`` is ``必修`` are counted, and
        courses whose name starts with ``体育`` are excluded. Returns 0.0 when
        nothing is counted (empty input, or no qualifying course), instead of
        dividing by zero.
        """
        total_credits = 0.0
        weighted_sum = 0.0
        for score in scores:
            if score.course_attribute != '必修' or score.course_name.startswith('体育'):
                continue
            total_credits += score.credits
            weighted_sum += score.final_grade * score.credits
        if total_credits == 0:
            return 0.0
        return weighted_sum / total_credits

137
index.html Normal file
View File

@ -0,0 +1,137 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>成绩展示</title>
<style>
table {
width: 100%;
border-collapse: collapse;
}
th, td {
border: 1px solid black;
padding: 8px;
text-align: left;
}
th {
background-color: #f2f2f2;
}
.form-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
}
input[type="text"], input[type="number"] {
width: 100%;
padding: 8px;
box-sizing: border-box;
}
button {
padding: 10px 15px;
background-color: #007BFF;
color: white;
border: none;
cursor: pointer;
}
button:hover {
background-color: #0056b3;
}
</style>
</head>
<body>
<h1>成绩展示</h1>
<div class="form-group">
<label for="year">学年:</label>
<input type="number" id="year" name="year" value="2023">
</div>
<div class="form-group">
<label for="term">学期:</label>
<input type="text" id="term" name="term" value="ALL">
</div>
<div class="form-group">
<label for="username">用户名:</label>
<input type="text" id="username" name="username">
</div>
<div class="form-group">
<label for="password">密码:</label>
<input type="password" id="password" name="password">
</div>
<!-- <button id="jmpLinkButton" style="display:none;" onclick="jmpLinkWithCookies()">教务管理系统</button> -->
<button onclick="fetchScores()">获取成绩</button>
<h5>PGA: <span id="pga"></span></h5>
<table id="scoresTable">
<thead>
<tr>
<th>课程名</th>
<th>总评成绩</th>
<th>绩点</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
<input type="hidden" id="cookies" value="">
<script>
/**
 * Probe the academic system's landing page and, if the request succeeds,
 * open it in a new window; otherwise show an alert.
 *
 * NOTE: `Cookie` is a forbidden request header, so browsers silently drop it
 * from fetch(). Session cookies obtained by the backend therefore cannot be
 * forwarded from here; the request only carries cookies the browser itself
 * holds for that origin (`credentials: 'include'`).
 */
async function jmpLinkWithCookies() {
    const url = "https://jw.glut.edu.cn/academic/index_new.jsp";
    try {
        const response = await fetch(url, {
            method: 'GET',
            credentials: 'include'
        });
        if (response.ok) {
            window.open(url);
        } else {
            alert('无法跳转到教务系统');
        }
    } catch (error) {
        alert('无法跳转到教务系统 error');
    }
}
/**
 * Read the form fields, request /get_scores from the backend and render the
 * result: the aggregate score goes into #pga, one table row per course goes
 * into #scoresTable, and the returned cookie string is stored in #cookies.
 *
 * Non-2xx replies are plain text, so they are reported as errors instead of
 * being parsed as JSON.
 */
function fetchScores() {
    const query = ['year', 'term', 'username', 'password']
        .map(id => `${id}=${encodeURIComponent(document.getElementById(id).value)}`)
        .join('&');

    fetch(`/get_scores?${query}`)
        .then(response => {
            if (!response.ok) {
                return response.text().then(message => {
                    throw new Error(`HTTP ${response.status}: ${message}`);
                });
            }
            return response.json();
        })
        .then(data => {
            const tableBody = document.querySelector('#scoresTable tbody');
            tableBody.innerHTML = '';
            document.getElementById('pga').textContent = data.pga;

            data.scores.forEach(score => {
                const row = document.createElement('tr');
                // textContent (not innerHTML) so course names coming from the
                // remote site can never be interpreted as markup.
                [score.course_name, score.final_grade, score.gpa].forEach(value => {
                    const cell = document.createElement('td');
                    cell.textContent = value;
                    row.appendChild(cell);
                });
                tableBody.appendChild(row);
            });

            document.getElementById('cookies').value = data.cookies;
        })
        .catch(error => {
            console.error('Error fetching scores:', error);
            alert(`获取成绩失败: ${error.message}`);
        });
}
</script>
</body>
</html>

72
main.py Normal file
View File

@ -0,0 +1,72 @@
import json
import argparse
from os import getenv
from dotenv import load_dotenv
from glut import GLUTAcademic, Term
from server import run as server_run
# Load variables from a local .env file into the process environment so the
# getenv() defaults used by the argument parser can pick them up.
# NOTE(review): python-dotenv does not override variables that already exist
# in the environment by default; on Windows USERNAME is usually predefined by
# the OS, so the value from .env may be ignored there -- confirm.
load_dotenv()
def fetch_scores(glut_academic: GLUTAcademic, terms: list[str]):
    """Fetch the scores for every ``<year>_<sp|au>`` item in *terms*.

    Args:
        glut_academic: A client that is already logged in.
        terms: Items such as ``2023_sp`` or ``2024_au`` (the suffix is
            case-insensitive). ``None`` or an empty list fetches nothing.

    Returns:
        A tuple ``(json_text, scores)``: the scores serialized as an indented
        JSON array, and the list of score records themselves.

    Raises:
        ValueError: if an item is not of the form ``<year>_sp`` / ``<year>_au``.
    """
    term_by_code = {'sp': Term.SPRING, 'au': Term.AUTUMN}
    all_scores: list[GLUTAcademic.Score] = []
    for spec in terms or []:
        # partition() never raises, so a malformed item reaches the explicit
        # error below instead of an opaque tuple-unpacking failure.
        year_str, separator, term_str = spec.partition('_')
        term_str = term_str.lower()
        selected_term = term_by_code.get(term_str)
        if not separator or selected_term is None or not year_str.isdecimal():
            raise ValueError(f'Invalid term: {spec}, must be one of 2023_sp, 2024_au')
        print(f'Fetching scores for {year_str} {term_str}...')
        all_scores.extend(
            glut_academic.get_scores(int(year_str), selected_term)
        )
    return json.dumps([score.as_dict() for score in all_scores],
                      ensure_ascii=False, indent=4), all_scores
def main():
    """Command-line entry point.

    With ``-s`` the HTTP server is started (credentials are then supplied per
    request, so none are required here). Otherwise the given terms are
    fetched with the configured account, the JSON is written to the output
    file unless disabled, and the aggregate score is printed.
    """
    parser = argparse.ArgumentParser(
        description='GLUT Academic Score Calculator',
    )
    parser.add_argument('-u', '--username', type=str, default=getenv('USERNAME', default=None))
    parser.add_argument('-p', '--password', type=str, default=getenv('PASSWORD', default=None))
    parser.add_argument('-s', '--server', action='store_true',
                        help='Run the server instead of fetching scores')
    parser.add_argument('-t', '--terms', type=str, nargs='+',
                        help='Year and term combinations (e.g., 2023_sp 2024_au)')
    parser.add_argument('-o', '--output', type=str, default='scores.json',
                        help='Output file name (default: scores.json)')
    # Quiet stays the default; BooleanOptionalAction adds --no-quiet so the
    # JSON can actually be printed (a store_true flag defaulting to True could
    # never be switched off).
    parser.add_argument('-q', '--quiet', action=argparse.BooleanOptionalAction, default=True,
                        help='Suppress terminal output of the JSON; use --no-quiet to print it')
    parser.add_argument('--disable-file-output', action='store_true',
                        help='Disable file output')
    parser.add_argument('--port', type=int, default=int(getenv('PORT', default='8000')),
                        help='Port number for the server')
    args = parser.parse_args()

    if args.server:
        server_run(port=args.port)
        return

    if not args.username or not args.password:
        parser.error('username and password are required: pass -u/-p or set '
                     'USERNAME and PASSWORD in the .env file')
    if not args.terms:
        parser.error('at least one term is required, e.g. -t 2023_sp')

    # The account is passed through as text: converting it to int would drop
    # leading zeros and reject non-numeric accounts.
    glut_academic = GLUTAcademic(args.username, args.password)
    try:
        glut_academic.login()
    except GLUTAcademic.LoginFailedError as exc:
        parser.exit(1, f'{exc}\n')

    res, origin_li = fetch_scores(glut_academic, args.terms)
    if not args.disable_file_output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(res)
    print(f"aggregate_score: {glut_academic.calculate_scores(origin_li)}")
    if not args.quiet:
        print(res)


if __name__ == '__main__':
    main()

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
requests
lxml
python-dotenv

62
server.py Normal file
View File

@ -0,0 +1,62 @@
import json
from http.server import SimpleHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, parse_qs
from glut import GLUTAcademic, Term
class RequestHandler(SimpleHTTPRequestHandler):
    """HTTP handler serving the score API and the single-page front end.

    ``GET /get_scores?year=&term=&username=&password=`` logs in to the
    academic site with the supplied account and answers with JSON
    (``scores``, ``pga``, ``cookies``). ``/`` and ``/index.html`` are served
    as static files; every other path answers 404.
    """

    # Static paths that may be served. The working directory also holds
    # private files (.env with credentials, dumped score files), so the
    # default "serve everything" behaviour of the base class is not exposed.
    PUBLIC_STATIC_PATHS = ('/', '/index.html')

    def _send_text(self, status, message):
        """Send a short plain-text response with the given status code."""
        body = message.encode('utf-8')
        self.send_response(status)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        parsed_path = urlparse(self.path)
        if parsed_path.path == '/get_scores':
            self._handle_get_scores(parse_qs(parsed_path.query))
        elif parsed_path.path in self.PUBLIC_STATIC_PATHS:
            super().do_GET()
        else:
            self.send_error(404, 'File not found')

    def do_HEAD(self):
        # Same allow-list as do_GET so HEAD cannot probe private files.
        if urlparse(self.path).path in self.PUBLIC_STATIC_PATHS:
            super().do_HEAD()
        else:
            self.send_error(404, 'File not found')

    def _handle_get_scores(self, query_params):
        """Validate the query, fetch the scores and write the JSON response."""
        try:
            year = int(query_params.get('year', ['2023'])[0])
        except ValueError:
            self._send_text(400, 'year must be an integer.')
            return

        username = query_params.get('username', [None])[0]
        password = query_params.get('password', [None])[0]
        if not username or not password:
            self._send_text(400, 'Username and password are required.')
            return

        # Look the term up by member name only; getattr() on untrusted input
        # would also reach arbitrary attributes of the enum class.
        term_name = query_params.get('term', ['ALL'])[0].upper()
        try:
            term = Term[term_name]
        except KeyError:
            self._send_text(400, 'term must be one of SPRING, AUTUMN, ALL.')
            return

        glut = GLUTAcademic(username, password)
        try:
            glut.login()
        except GLUTAcademic.LoginFailedError as e:
            self._send_text(401, str(e))
            return
        except OSError as e:
            # Network-level failures of the upstream request.
            self._send_text(502, f'Failed to reach the academic system: {e}')
            return

        try:
            res = glut.get_scores(year, term)
        except (RuntimeError, ValueError, IndexError, OSError) as e:
            self._send_text(502, f'Failed to fetch scores: {e}')
            return

        try:
            pga = glut.calculate_scores(res)
        except ZeroDivisionError:
            # No course contributed any credits to the average.
            pga = 0.0

        # Save the score data to scores.txt.
        res = [i.as_dict() for i in res]
        res_json = json.dumps(res, ensure_ascii=False, indent=4)
        with open('scores.txt', 'w', encoding='utf-8') as f:
            f.write(res_json)

        response_data = {
            'scores': res,
            'pga': pga,
            'cookies': glut.get_cookies(),
        }
        body = json.dumps(response_data, ensure_ascii=False, indent=4).encode('utf-8')
        self.send_response(200)
        self.send_header('Content-type', 'application/json; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)
def run(server_class=HTTPServer, handler_class=RequestHandler, port=8000, host=''):
    """Start the HTTP server and block until it is interrupted.

    Args:
        server_class: Server type to instantiate.
        handler_class: Request handler type passed to the server.
        port: TCP port to listen on.
        host: Address to bind to; the default ``''`` listens on all
            interfaces. Pass ``'127.0.0.1'`` to accept local connections only.
    """
    # The context manager closes the listening socket on the way out.
    with server_class((host, port), handler_class) as httpd:
        bound_host, bound_port = httpd.server_address[:2]
        print(f'Starting httpd server on http://{bound_host}:{bound_port} ...')
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the server; exit without a traceback.
            print('Stopping httpd server ...')


if __name__ == '__main__':
    run()