commit 34770cfeffe0fd5b7b3f0b60ada76d05a25be4a6 Author: ZZY <2450266535@qq.com> Date: Thu Dec 12 22:13:16 2024 +0800 init: 添加 GLUT 学分统计爬虫项目 - 新增 README.md 文件,包含项目描述、初始化步骤和使用方法 - 添加 default.env 文件,用于配置环境变量 - 实现 glut.py,包含登录、获取成绩、解析成绩等功能 - 添加 index.html,提供 Web 界面展示成绩 - 实现 main.py,提供命令行接口 - 添加 requirements.txt,列出项目依赖 - 实现 server.py,提供 HTTP 服务接口 diff --git a/README.md b/README.md new file mode 100644 index 0000000..98443d8 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# GLUT spider web + +- 项目描述 + - 用于获取GLUT学分统计,即爬虫案例 + +- 初始化项目 + +```shell +# Windows +python -m venv .venv +pip install -r requirements.txt + +# Linux +python3 -m venv .venv +pip3 install -r requirements.txt +``` + +- 配置环境变量( **推荐配置** ) + + 将`default.env`内容填写完整后改名为`.env` + +- 使用方法 + +```shell +# 帮助文档 +python main.py -h + +# 开启服务器 +python main.py -s + +# 使用命令行操作,如果没有配置环境变量则需要给出用户名和密码 +# 注意默认会输出scores.json +> python main.py -t 2023_sp +Fetching scores for 2023 sp... +aggregate_score: 72.32824427480917 + +> python main.py -t 2023_sp -u 32220520xxxxx -p xxxxx +Fetching scores for 2023 sp... 
+aggregate_score: 72.32824427480917
+```
diff --git a/default.env b/default.env
new file mode 100644
index 0000000..47025bf
--- /dev/null
+++ b/default.env
@@ -0,0 +1,3 @@
+USERNAME=3220520xxxxx
+PASSWORD=YourPassword
+PORT=8000
diff --git a/glut.py b/glut.py
new file mode 100644
index 0000000..20bf632
--- /dev/null
+++ b/glut.py
@@ -0,0 +1,242 @@
+from dataclasses import dataclass, asdict
+from enum import Enum, auto
+import requests
+from lxml import etree
+
+class Term(Enum):
+    """Academic term selector used when querying scores."""
+    SPRING = auto()
+    AUTUMN = auto()
+    ALL = auto()
+
+    def __repr__(self):
+        # Lower-case member name, e.g. "spring"; used by Score.as_dict().
+        return self.name.lower()
+
+
+class GLUTAcademic:
+    """Client for the GLUT academic site: log in, fetch and parse scores."""
+
+    class LoginFailedError(Exception):
+        """Raised by login() when the site does not accept the request."""
+
+    # Numeric equivalents for non-numeric (level) final grades.
+    FINAL_GRADE_DICT = {
+        '优秀': 95,
+        '良': 85,
+        '中': 75,
+        '及格': 65,
+        '不及格': 40,
+        '': 0,
+    }
+
+    # Number of cell texts parse_scope() reads from one score row.
+    SCORE_COLUMNS = 21
+
+    # Seconds to wait for the remote site before a request is aborted.
+    REQUEST_TIMEOUT = 30
+
+    @dataclass
+    class Score:
+        """One row of the score table."""
+        year: int
+        term: Term
+        department: str
+        course_number: int
+        course_name: str
+        course_sequence: int
+        instructor: str
+        final_grade: float  # numeric score, or the FINAL_GRADE_DICT equivalent
+        gpa: float
+        credits: float
+        hours: float
+        assessment_method: str
+        course_attribute: str
+        notes: str
+        exam_type: str
+        retake_flag: str
+        course_requirements: str
+        course_category: str
+        coefficient: str
+        second_degree_minor: str
+        pass_flag: str
+
+        def as_dict(self):
+            """Return the fields as a dict with ``term`` as a lower-case string."""
+            d = asdict(self)
+            d['term'] = repr(self.term)
+            return d
+
+        def __str__(self):
+            return (f"Score(学年={self.year}, 学期={self.term}, 开课院系={self.department}, "
+                    f"课程号={self.course_number}, 课程名={self.course_name}, 课序号={self.course_sequence}, "
+                    f"主讲教师={self.instructor}, 总评={self.final_grade}, 绩点={self.gpa}, "
+                    f"学分={self.credits}, 学时={self.hours}, 考核方式={self.assessment_method}, "
+                    f"选课属性={self.course_attribute}, 备注={self.notes}, 考试性质={self.exam_type}, "
+                    f"是否缓考={self.retake_flag}, 课程要求={self.course_requirements}, "
+                    f"课程类别={self.course_category}, 系数={self.coefficient}, "
+                    f"二学位辅修={self.second_degree_minor}, 及格标志={self.pass_flag})")
+
+    def __init__(self, username: int, password: str):
+        """Store the credentials and create the HTTP session (no request is sent)."""
+        self.session = requests.Session()
+        self.username = username
+        self.password = password
+        self.login_status = False
+        self.base_url = 'https://jw.glut.edu.cn'
+
+    def login(self):
+        """Log in with the stored credentials.
+
+        Raises:
+            LoginFailedError: if the status code is not 200 or the response
+                carries ``X-Frame-Options: DENY``.
+        """
+        url = f"{self.base_url}/academic/j_acegi_security_check"
+        # Let requests build the query string so that characters such as
+        # '&', '#' or '+' in the credentials are percent-encoded instead of
+        # silently corrupting the URL.
+        params = {
+            'j_username': self.username,
+            'j_password': self.password,
+            'j_captcha': 'undefined',
+        }
+        res = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
+        self.login_status = (res.status_code == 200
+                             and res.headers.get('X-Frame-Options') != 'DENY')
+        if not self.login_status:
+            raise self.LoginFailedError('login error maybe username or password is wrong')
+
+    def get_cookies(self):
+        """Return the session cookies as a ``k=v; k=v`` string, or None before login."""
+        if not self.login_status:
+            return None
+        return '; '.join([f'{key}={value}' for key, value in self.session.cookies.items()])
+
+    def get_scores_raw(self, year: int = 2024, term: Term = Term.ALL):
+        """Post the score query form and return the raw response body.
+
+        Raises:
+            RuntimeError: if the status code is not 200 or the response
+                carries ``X-Frame-Options: DENY``.
+        """
+        url = f"{self.base_url}/academic/manager/score/studentOwnScore.do"
+        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+        term_dict = {
+            Term.SPRING: "1",
+            Term.AUTUMN: "2",
+            Term.ALL: ""
+        }
+
+        data = {
+            # The form's year field is an offset: 2013 is sent as 33.
+            'year': year - 2013 + 33,
+            'term': term_dict[term],
+            'prop': '',
+            'groupName': '',
+            'para': '0',
+            'sortColumn': '',
+            'Submit': '查询'
+        }
+        res = self.session.post(url, data=data, headers=headers,
+                                timeout=self.REQUEST_TIMEOUT)
+
+        if res.status_code != 200 or res.headers.get('X-Frame-Options') == 'DENY':
+            raise RuntimeError("Failed to get scores")
+        return res.content
+
+    def parse_scores(self, raw_score: bytes) -> list[Score]:
+        """Parse the score page into Score objects.
+
+        Raises:
+            RuntimeError: if the page does not contain exactly one
+                ``<table class="datalist">``; the page is then saved to
+                ``error.html`` for inspection.
+        """
+        tree = etree.fromstring(raw_score, parser=etree.HTMLParser(encoding='utf-8'))
+        table = tree.xpath('//table[@class="datalist"]')
+        if len(table) != 1:
+            err_msg = f"Failed to parse scores {table}"
+            with open('error.html', 'wb') as f:
+                f.write(raw_score)
+            raise RuntimeError(err_msg)
+        # The first row is skipped (presumably the header row).
+        tr_list = table[0].xpath('tr')[1:]
+        return [self.parse_scope(tr.xpath('td/text()')) for tr in tr_list]
+
+    def parse_scope(self, texts: list[str]) -> Score:
+        """Build a Score from the cell texts of one table row.
+
+        Raises:
+            ValueError: if the row has too few cells, the final grade is
+                neither numeric nor a known level, or the term is not 春/秋.
+        """
+        texts = [t.strip() for t in texts]
+        if len(texts) < self.SCORE_COLUMNS:
+            raise ValueError(
+                f'score row has {len(texts)} cells, expected {self.SCORE_COLUMNS}')
+
+        # Level grades go through the lookup table; anything else must be numeric.
+        final_grade = self.FINAL_GRADE_DICT.get(texts[7])
+        if final_grade is None:
+            try:
+                final_grade = float(texts[7])
+            except ValueError as exc:
+                raise ValueError('Final grade is not a int number') from exc
+
+        term_map = {
+            '春': Term.SPRING,
+            '秋': Term.AUTUMN
+        }
+        term = term_map.get(texts[1])
+        if term is None:
+            raise ValueError(f'term error maybe 春 or 秋 but got {texts[1]}')
+
+        return self.Score(
+            year=int(texts[0]),
+            term=term,
+            department=texts[2],
+            course_number=int(texts[3]),
+            course_name=texts[4],
+            course_sequence=int(texts[5]),
+            instructor=texts[6],
+            final_grade=final_grade,
+            gpa=float(texts[8]),
+            credits=float(texts[9]),
+            hours=float(texts[10]),
+            assessment_method=texts[11],
+            course_attribute=texts[12],
+            notes=texts[13],
+            exam_type=texts[14],
+            retake_flag=texts[15],
+            course_requirements=texts[16],
+            course_category=texts[17],
+            coefficient=texts[18],
+            second_degree_minor=texts[19],
+            pass_flag=texts[20],
+        )
+
+    def get_scores(self, year: int = 2024, term: Term = Term.ALL) -> list[Score]:
+        """Fetch and parse the scores for the given year and term."""
+        raw_score = self.get_scores_raw(year, term)
+        return self.parse_scores(raw_score)
+
+    def calculate_scores(self, scores: list[Score]) -> float:
+        """Return the credit-weighted average final grade.
+
+        Only courses whose attribute is 必修 and whose name does not start
+        with 体育 are counted. Returns 0.0 when no course qualifies.
+        """
+        total_credits = 0
+        total_grades = 0
+        for i in scores:
+            if i.course_attribute != '必修' or i.course_name.startswith('体育'):
+                continue
+            total_credits += i.credits
+            total_grades += i.final_grade * i.credits
+        # Guard the division: an empty list, a list without qualifying
+        # courses, or only zero-credit courses would otherwise raise
+        # ZeroDivisionError.
+        if not total_credits:
+            return 0.0
+        return total_grades / total_credits
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..1e8f393
--- /dev/null
+++ b/index.html
@@ -0,0 +1,137 @@
+ + + + + + 成绩展示 + + + +

成绩展示

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ + + +
PGA:
+ + + + + + + + + + +
课程名总评成绩绩点
+ + + + + +
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..0d53c9a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,99 @@
+import json
+import argparse
+from os import getenv
+from dotenv import load_dotenv
+from glut import GLUTAcademic, Term
+from server import run as server_run
+
+# override=True: USERNAME is predefined by the OS on Windows (and in some
+# Linux sessions); without it the value from .env would be ignored.
+load_dotenv(override=True)
+
+# Suffix of a "<year>_<suffix>" term argument -> Term member.
+TERM_SUFFIXES = {
+    'sp': Term.SPRING,
+    'au': Term.AUTUMN,
+}
+
+def fetch_scores(glut_academic: GLUTAcademic, terms: list[str]):
+    """Fetch the scores of every requested term.
+
+    Args:
+        glut_academic: a logged-in client.
+        terms: items of the form ``<year>_<sp|au>``, e.g. ``2023_sp``.
+
+    Returns:
+        A tuple ``(json_text, scores)``: the scores serialised as JSON and
+        the list of Score objects.
+
+    Raises:
+        ValueError: if an item is not of the form ``<year>_<sp|au>``.
+    """
+    all_scores: list[GLUTAcademic.Score] = []
+    for item in terms:
+        year, _, suffix = item.partition('_')
+        suffix = suffix.lower()
+        term = TERM_SUFFIXES.get(suffix)
+        if term is None or not year.isdigit():
+            raise ValueError(f'Invalid term: {item}, must be one of 2023_sp, 2024_au')
+
+        print(f'Fetching scores for {year} {suffix}...')
+        all_scores.extend(
+            glut_academic.get_scores(int(year), term)
+        )
+
+    return json.dumps([score.as_dict() for score in all_scores],
+                      ensure_ascii=False, indent=4), all_scores
+
+def main():
+    """Command-line entry point: run the server or fetch scores."""
+    parser = argparse.ArgumentParser(
+        description='GLUT Academic Score Calculator',
+    )
+    parser.add_argument('-u', '--username', type=str, default=getenv('USERNAME', default=None))
+    parser.add_argument('-p', '--password', type=str, default=getenv('PASSWORD', default=None))
+    parser.add_argument('-s', '--server', action='store_true',
+                        help='Run the server instead of fetching scores')
+    parser.add_argument('-t', '--terms', type=str, nargs='+',
+                        help='Year and term combinations (e.g., 2023_sp 2024_au)')
+    parser.add_argument('-o', '--output', type=str, default='scores.json',
+                        help='Output file name (default: scores.json)')
+    # Quiet stays the default; BooleanOptionalAction adds --no-quiet so the
+    # JSON dump can actually be switched on (a store_true flag that
+    # defaults to True can never become False).
+    parser.add_argument('-q', '--quiet', action=argparse.BooleanOptionalAction,
+                        default=True,
+                        help='Suppress terminal output')
+    parser.add_argument('--disable-file-output', action='store_true',
+                        help='Disable file output')
+    parser.add_argument('--port', type=int, default=int(getenv('PORT', default='8000')),
+                        help='Port number for the server')
+
+    args = parser.parse_args()
+
+    if args.server:
+        # The server takes credentials from each request, none are needed here.
+        server_run(port=args.port)
+        return
+
+    if not args.username or not args.password:
+        parser.error('USERNAME and PASSWORD must be set in the .env file '
+                     'or passed with -u/-p')
+    if not args.terms:
+        parser.error('at least one term is required (-t), e.g. -t 2023_sp')
+
+    glut_acdemic = GLUTAcademic(int(args.username), args.password)
+    glut_acdemic.login()
+
+    res, origin_li = fetch_scores(glut_acdemic, args.terms)
+
+    if not args.disable_file_output:
+        with open(args.output, 'w', encoding='utf-8') as f:
+            f.write(res)
+
+    print(f"aggregate_score: {glut_acdemic.calculate_scores(origin_li)}")
+    if not args.quiet:
+        print(res)
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2e3e331
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests
+lxml
+python-dotenv
\ No newline at end of file
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..07e71e3
--- /dev/null
+++ b/server.py
@@ -0,0 +1,109 @@
+import json
+from http.server import SimpleHTTPRequestHandler, HTTPServer
+from urllib.parse import urlparse, parse_qs
+from glut import GLUTAcademic, Term
+
+class RequestHandler(SimpleHTTPRequestHandler):
+    """Serve index.html and the /get_scores JSON endpoint."""
+
+    # Only these paths are handed to the static file handler. Serving the
+    # whole working directory would expose .env (credentials) and the
+    # saved score files to every client.
+    STATIC_PATHS = ('/', '/index.html')
+
+    def _send_text(self, status: int, message: str):
+        """Send a plain-text response with the given status code."""
+        body = message.encode('utf-8')
+        self.send_response(status)
+        self.send_header('Content-type', 'text/plain; charset=utf-8')
+        self.send_header('Content-Length', str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_GET(self):
+        """Dispatch GET requests: /get_scores, the static page, or 404."""
+        parsed_path = urlparse(self.path)
+
+        if parsed_path.path == '/get_scores':
+            self._handle_get_scores(parse_qs(parsed_path.query))
+        elif parsed_path.path in self.STATIC_PATHS:
+            super().do_GET()
+        else:
+            self.send_error(404, 'File not found')
+
+    def _handle_get_scores(self, query_params: dict):
+        """Log in with the supplied credentials and reply with the scores as JSON.
+
+        NOTE(review): the credentials arrive in the query string of a GET
+        request, so they end up in the request log and browser history;
+        moving them to a POST body requires changing index.html as well.
+        """
+        username = query_params.get('username', [None])[0]
+        password = query_params.get('password', [None])[0]
+        if not username or not password:
+            self._send_text(400, 'Username and password are required.')
+            return
+
+        try:
+            year = int(query_params.get('year', [2023])[0])
+        except ValueError:
+            self._send_text(400, 'year must be an integer.')
+            return
+
+        # Look the term up among the enum members only; getattr() on a
+        # client-supplied name could return any attribute of the class, and
+        # a name that is not a member (such as 'SUMMER') raises
+        # AttributeError.
+        term_name = query_params.get('term', ['ALL'])[0].upper()
+        term = Term.__members__.get(term_name)
+        if term is None:
+            self._send_text(400, 'term must be one of SPRING, AUTUMN, ALL.')
+            return
+
+        glut = GLUTAcademic(username, password)
+        try:
+            glut.login()
+        except GLUTAcademic.LoginFailedError as e:
+            self._send_text(401, str(e))
+            return
+
+        try:
+            scores = glut.get_scores(year, term)
+        except (RuntimeError, ValueError) as e:
+            # Raised by GLUTAcademic when the page cannot be fetched or parsed.
+            self._send_text(502, str(e))
+            return
+        pga = glut.calculate_scores(scores)
+
+        # Save the score data to scores.txt
+        res = [i.as_dict() for i in scores]
+        res_json = json.dumps(res, ensure_ascii=False, indent=4)
+        with open('scores.txt', 'w', encoding='utf-8') as f:
+            f.write(res_json)
+
+        response_data = {
+            'scores': res,
+            'pga': pga,
+            'cookies': glut.get_cookies(),
+        }
+
+        self.send_response(200)
+        self.send_header('Content-type', 'application/json')
+        self.end_headers()
+        self.wfile.write(json.dumps(response_data, ensure_ascii=False, indent=4)
+                         .encode('utf-8'))
+
+def run(server_class=HTTPServer, handler_class=RequestHandler, port=8000):
+    """Start the HTTP server on all interfaces at ``port`` and serve until interrupted."""
+    server_address = ('', port)
+    httpd = server_class(server_address, handler_class)
+    print(f'Starting httpd server on http://{httpd.server_address[0]}:{httpd.server_address[1]} ...')
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        pass
+    finally:
+        httpd.server_close()
+
+if __name__ == '__main__':
+    run()