def check_answers(selected_answers: list[str], correct_answers: list[str]) -> bool:
    """Tell whether the submitted answers match the correct ones exactly.

    Both lists are compared as sets, so ordering and repeated entries
    do not influence the outcome.

    Args:
        selected_answers: Answer texts chosen by the user.
        correct_answers: Answer texts marked as correct for the question.

    Returns:
        True when both lists contain exactly the same distinct texts.
    """
    chosen = frozenset(selected_answers)
    expected = frozenset(correct_answers)
    return chosen == expected
@app.get("/", include_in_schema=False)  # Hide from OpenAPI docs
async def root():
    """Liveness endpoint: answer with a fixed payload showing the app responds."""
    payload = dict(status="ok")
    return payload
pandas, falling back to manual parsing" + ) + raise ValueError("Data corruption detected") + + except Exception as pandas_error: + print(f"Pandas parsing failed: {pandas_error}") + print("Trying manual parsing...") + + # Fallback to manual parsing with better regex handling + import re + + contents_str = contents.decode("utf-8") + lines = contents_str.strip().split("\n") + + # Parse header + header_line = lines[0] + header = [col.strip() for col in header_line.split(",")] + + if len(header) != 2: + raise ValueError( + f"Header should have 2 columns, found {len(header)}" + ) + + # Parse data with regex to handle quoted fields + data_rows = [] + csv_pattern = r'"([^"]*(?:""[^"]*)*)","(\[.*\])"' + + for i, line in enumerate(lines[1:], 1): + match = re.match(csv_pattern, line.strip()) + if match: + question = match.group(1).replace( + '""', '"' + ) # Handle escaped quotes + answer = match.group(2) + data_rows.append([question, answer]) + else: + print(f"Warning: Could not parse line {i}: {line[:100]}...") + + if not data_rows: + raise ValueError("No valid data rows found with manual parsing") + + # Create DataFrame + df = pd.DataFrame(data_rows, columns=header) + print("Manual parsing successful!") + print(f"File columns: {df.columns.tolist()}") + print(f"DataFrame shape: {df.shape}") + + print(f"Sample question: {df.iloc[0].iloc[0] if len(df) > 0 else 'N/A'}") + print( + f"Sample answer preview: {df.iloc[0].iloc[1][:50] if len(df) > 0 else 'N/A'}..." + ) + + # Check for both capitalized and lowercase column names + columns = [col.lower() for col in df.columns] + if "question" not in columns or "answer" not in columns: + return JSONResponse( + { + "error": f"CSV file {file.filename} must have 'Question' and 'Answer' columns." 
def safe_json_parse(x):
    """Parse one "answer" cell into a list of answer-option dicts.

    The raw text is tried as JSON first, so well-formed JSON whose strings
    contain escaped quotes (``\\"``) is parsed as-is instead of being
    corrupted by the clean-up steps. Only when the raw text does not yield
    a list are the legacy clean-ups applied (un-escaping quotes, stripping
    one pair of wrapping quotes, collapsing doubled backslashes).

    Args:
        x: Raw cell value. Non-string values are returned unchanged.

    Returns:
        A list holding only the items that are dicts with both a "text"
        and an "is_correct" key. An empty list is returned for blank
        input, unparseable input, non-list JSON, or when no item is valid.
    """
    if not isinstance(x, str):
        return x

    raw = x.strip()
    if not raw:
        return []

    def strip_outer_quotes(text):
        # Drop a single pair of wrapping double quotes, if present.
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
        return text

    # Ordered from least to most invasive; the first candidate that
    # decodes to a list wins.
    candidates = [
        raw,
        strip_outer_quotes(raw.replace('\\"', '"')),
        strip_outer_quotes(raw).replace('\\"', '"').replace("\\\\", "\\"),
    ]

    parsed = None
    failure = None
    seen = set()
    for candidate in candidates:
        if candidate in seen:
            continue  # clean-up changed nothing; no point re-parsing
        seen.add(candidate)
        try:
            value = json.loads(candidate)
        except (json.JSONDecodeError, RecursionError) as e:
            # RecursionError: pathologically nested input from an upload.
            failure = f"Error: {e}"
            continue
        if isinstance(value, list):
            parsed = value
            break
        failure = f"Warning: Expected list but got {type(value)}"

    if parsed is None:
        print(f"JSON parsing error for value: {repr(x[:100])}")
        print(failure)
        return []

    # Validate on every path so later code can index "text"/"is_correct"
    # on each returned item without a KeyError.
    valid_answers = []
    for item in parsed:
        if isinstance(item, dict) and "text" in item and "is_correct" in item:
            valid_answers.append(item)
        else:
            print(
                f"Warning: Invalid answer format - missing 'text' or 'is_correct': {item}"
            )

    if not valid_answers:
        print(f"Warning: No valid answer options found in: {repr(x[:50])}")

    return valid_answers
+350,7 @@ async def get_next_question(session_id: str): "multiple_choice": multiple_choice, } + @app.post("/submit-answer/") async def submit_answer(submission: AnswerSubmission): if submission.session_id not in quiz_sessions: @@ -173,7 +359,11 @@ async def submit_answer(submission: AnswerSubmission): correct_answers = get_correct_answers(submission.session_id) is_correct = check_answers(submission.selected_answers, correct_answers) update_stats(submission.session_id, is_correct) - return {"result": "Correct" if is_correct else "Incorrect", "correct_answers": correct_answers} + return { + "result": "Correct" if is_correct else "Incorrect", + "correct_answers": correct_answers, + } + @app.get("/quiz-stats/") async def get_quiz_stats(session_id: str): @@ -191,6 +381,7 @@ async def get_quiz_stats(session_id: str): "incorrect_answers": incorrect_count, } + @app.post("/move-question-to-bottom/") async def move_question_to_bottom(request: MoveQuestionRequest): session = validate_session(request.session_id) @@ -210,6 +401,7 @@ async def move_question_to_bottom(request: MoveQuestionRequest): session["data"] = df return {"message": "Question moved to the bottom successfully."} + @app.post("/reset-session/") async def reset_session(session_reset_request: SessionResetRequest): session = quiz_sessions.get(session_reset_request.session_id) @@ -223,6 +415,7 @@ async def reset_session(session_reset_request: SessionResetRequest): return {"message": "Session reset successfully."} + # Application startup if __name__ == "__main__": print("Starting server on 0.0.0.0:8000...")