mirror of
https://github.com/Stijnvandenbroek/stamp.git
synced 2026-01-16 23:46:54 +01:00
feat: improved csv parsing
This commit is contained in:
@@ -19,35 +19,43 @@ app.add_middleware(
|
|||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# Pydantic models
|
# Pydantic models
|
||||||
class AnswerOption(BaseModel):
    """Schema of a single answer option: its text and whether it is correct."""

    # Text of the option (presumably what is shown to the user - confirm against the client).
    text: str
    # True when this option counts as a correct answer.
    is_correct: bool
|
||||||
|
|
||||||
|
|
||||||
class AnswerSubmission(BaseModel):
    """Request body accepted by the ``/submit-answer/`` endpoint."""

    # Key of the quiz session in ``quiz_sessions``.
    session_id: str
    # Answer texts chosen by the user; compared against the correct answer texts.
    selected_answers: list[str]
|
||||||
|
|
||||||
|
|
||||||
class MoveQuestionRequest(BaseModel):
    """Request body accepted by the ``/move-question-to-bottom/`` endpoint."""

    # Key of the quiz session in ``quiz_sessions``.
    session_id: str
    # Index of the question to move.
    # NOTE(review): presumably a row label of the session DataFrame - confirm in the handler.
    question_index: int
|
||||||
|
|
||||||
|
|
||||||
class SessionResetRequest(BaseModel):
    """Request body accepted by the ``/reset-session/`` endpoint."""

    # Key of the quiz session in ``quiz_sessions`` that should be reset.
    session_id: str
|
||||||
|
|
||||||
|
|
||||||
class QuizSettings(BaseModel):
    """Quiz options supplied with a CSV upload (sent as JSON in the ``settings`` form field)."""

    # NOTE(review): presumably re-queues a question after a wrong answer;
    # the code that reads this flag is not visible here - confirm.
    repeat_on_mistake: bool
    # When set, ``get_next_question`` shuffles the answer texts before returning them.
    shuffle_answers: bool
    # When set, the upload handler shuffles the combined question rows.
    randomise_order: bool
    # Number of times the upload handler repeats the combined question set.
    question_count_multiplier: int
|
||||||
|
|
||||||
|
|
||||||
# Global storage
# Module-level, in-memory map of session id -> session state. Handlers look
# sessions up with ``quiz_sessions.get(session_id)`` and index the result by
# string keys such as "data" and "settings".
quiz_sessions: dict[str, dict] = {}
|
||||||
|
|
||||||
|
|
||||||
# Utility functions
|
# Utility functions
|
||||||
def check_answers(selected_answers: list[str], correct_answers: list[str]) -> bool:
    """Return True when the selected answers match the correct answers exactly.

    Both lists are compared as sets, so ordering and duplicate entries are
    ignored; a partial or over-complete selection is not a match.

    Args:
        selected_answers: Answer texts chosen by the user.
        correct_answers: Answer texts that are marked correct.

    Returns:
        True if both lists contain the same distinct texts, otherwise False.
    """
    return set(selected_answers) == set(correct_answers)
|
||||||
|
|
||||||
|
|
||||||
def get_correct_answers(session_id: str) -> list[str]:
|
def get_correct_answers(session_id: str) -> list[str]:
|
||||||
session = quiz_sessions.get(session_id)
|
session = quiz_sessions.get(session_id)
|
||||||
if not session:
|
if not session:
|
||||||
@@ -55,7 +63,10 @@ def get_correct_answers(session_id: str) -> list[str]:
|
|||||||
|
|
||||||
df = session["data"]
|
df = session["data"]
|
||||||
question_index = session["current_question_index"]
|
question_index = session["current_question_index"]
|
||||||
return [ans["text"] for ans in df.loc[question_index]["answer"] if ans["is_correct"]]
|
return [
|
||||||
|
ans["text"] for ans in df.loc[question_index]["answer"] if ans["is_correct"]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def update_stats(session_id: str, is_correct: bool) -> None:
|
def update_stats(session_id: str, is_correct: bool) -> None:
|
||||||
session = quiz_sessions.get(session_id)
|
session = quiz_sessions.get(session_id)
|
||||||
@@ -69,19 +80,24 @@ def update_stats(session_id: str, is_correct: bool) -> None:
|
|||||||
else:
|
else:
|
||||||
session["incorrect_count"] += 1
|
session["incorrect_count"] += 1
|
||||||
|
|
||||||
|
|
||||||
def validate_session(session_id: str):
    """Look up a quiz session and fail the request when it does not exist.

    Args:
        session_id: Key of the session in ``quiz_sessions``.

    Returns:
        The session object stored under ``session_id``.

    Raises:
        HTTPException: Status 400 with detail "Invalid session ID." when
            nothing is stored under the id (or the stored value is falsy).
    """
    session = quiz_sessions.get(session_id)
    if not session:
        raise HTTPException(status_code=400, detail="Invalid session ID.")
    return session
|
||||||
|
|
||||||
|
|
||||||
# API endpoints
|
# API endpoints
|
||||||
@app.get("/", include_in_schema=False)  # Hide from OpenAPI docs
async def root():
    """Return a static ``{"status": "ok"}`` payload for ``GET /``."""
    return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/upload-csv-with-settings/")
|
@app.post("/upload-csv-with-settings/")
|
||||||
async def upload_csv_with_settings(files: list[UploadFile] = File(...), settings: str = Form(...)):
|
async def upload_csv_with_settings(
|
||||||
|
files: list[UploadFile] = File(...), settings: str = Form(...)
|
||||||
|
):
|
||||||
print("Received upload request:")
|
print("Received upload request:")
|
||||||
print(f"Number of files: {len(files)}")
|
print(f"Number of files: {len(files)}")
|
||||||
print(f"File names: {[file.filename for file in files]}")
|
print(f"File names: {[file.filename for file in files]}")
|
||||||
@@ -92,29 +108,194 @@ async def upload_csv_with_settings(files: list[UploadFile] = File(...), settings
|
|||||||
print(f"Parsed settings: {quiz_settings}")
|
print(f"Parsed settings: {quiz_settings}")
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
print(f"JSON decode error: {e}")
|
print(f"JSON decode error: {e}")
|
||||||
return JSONResponse({"error": f"Invalid JSON in settings: {str(e)}"}, status_code=400)
|
return JSONResponse(
|
||||||
|
{"error": f"Invalid JSON in settings: {str(e)}"}, status_code=400
|
||||||
|
)
|
||||||
|
|
||||||
combined_df = pd.DataFrame()
|
combined_df = pd.DataFrame()
|
||||||
for file in files:
|
for file in files:
|
||||||
try:
|
try:
|
||||||
print(f"Processing file: {file.filename}")
|
print(f"Processing file: {file.filename}")
|
||||||
contents = await file.read()
|
contents = await file.read()
|
||||||
df = pd.read_csv(io.BytesIO(contents))
|
|
||||||
print(f"File columns: {df.columns.tolist()}")
|
|
||||||
|
|
||||||
if "question" not in df.columns or "answer" not in df.columns:
|
# Try pandas first - it's generally better at handling quoted CSV
|
||||||
return JSONResponse({"error": f"CSV file {file.filename} must have 'question' and 'answer' columns."}, status_code=400)
|
try:
|
||||||
|
df = pd.read_csv(
|
||||||
|
io.BytesIO(contents),
|
||||||
|
dtype=str, # Keep everything as strings initially
|
||||||
|
keep_default_na=False, # Don't convert to NaN
|
||||||
|
)
|
||||||
|
print("Pandas parsing successful!")
|
||||||
|
print(f"File columns: {df.columns.tolist()}")
|
||||||
|
print(f"DataFrame shape: {df.shape}")
|
||||||
|
|
||||||
|
if len(df.columns) != 2:
|
||||||
|
raise ValueError(f"Expected 2 columns, got {len(df.columns)}")
|
||||||
|
|
||||||
|
# Check if data looks correct (no fragments)
|
||||||
|
sample_answer = df.iloc[0].iloc[1] if len(df) > 0 else ""
|
||||||
|
if not sample_answer.strip().startswith(
|
||||||
|
"["
|
||||||
|
) or not sample_answer.strip().endswith("]"):
|
||||||
|
print(
|
||||||
|
"Data appears corrupted in pandas, falling back to manual parsing"
|
||||||
|
)
|
||||||
|
raise ValueError("Data corruption detected")
|
||||||
|
|
||||||
|
except Exception as pandas_error:
|
||||||
|
print(f"Pandas parsing failed: {pandas_error}")
|
||||||
|
print("Trying manual parsing...")
|
||||||
|
|
||||||
|
# Fallback to manual parsing with better regex handling
|
||||||
|
import re
|
||||||
|
|
||||||
|
contents_str = contents.decode("utf-8")
|
||||||
|
lines = contents_str.strip().split("\n")
|
||||||
|
|
||||||
|
# Parse header
|
||||||
|
header_line = lines[0]
|
||||||
|
header = [col.strip() for col in header_line.split(",")]
|
||||||
|
|
||||||
|
if len(header) != 2:
|
||||||
|
raise ValueError(
|
||||||
|
f"Header should have 2 columns, found {len(header)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parse data with regex to handle quoted fields
|
||||||
|
data_rows = []
|
||||||
|
csv_pattern = r'"([^"]*(?:""[^"]*)*)","(\[.*\])"'
|
||||||
|
|
||||||
|
for i, line in enumerate(lines[1:], 1):
|
||||||
|
match = re.match(csv_pattern, line.strip())
|
||||||
|
if match:
|
||||||
|
question = match.group(1).replace(
|
||||||
|
'""', '"'
|
||||||
|
) # Handle escaped quotes
|
||||||
|
answer = match.group(2)
|
||||||
|
data_rows.append([question, answer])
|
||||||
|
else:
|
||||||
|
print(f"Warning: Could not parse line {i}: {line[:100]}...")
|
||||||
|
|
||||||
|
if not data_rows:
|
||||||
|
raise ValueError("No valid data rows found with manual parsing")
|
||||||
|
|
||||||
|
# Create DataFrame
|
||||||
|
df = pd.DataFrame(data_rows, columns=header)
|
||||||
|
print("Manual parsing successful!")
|
||||||
|
print(f"File columns: {df.columns.tolist()}")
|
||||||
|
print(f"DataFrame shape: {df.shape}")
|
||||||
|
|
||||||
|
print(f"Sample question: {df.iloc[0].iloc[0] if len(df) > 0 else 'N/A'}")
|
||||||
|
print(
|
||||||
|
f"Sample answer preview: {df.iloc[0].iloc[1][:50] if len(df) > 0 else 'N/A'}..."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check for both capitalized and lowercase column names
|
||||||
|
columns = [col.lower() for col in df.columns]
|
||||||
|
if "question" not in columns or "answer" not in columns:
|
||||||
|
return JSONResponse(
|
||||||
|
{
|
||||||
|
"error": f"CSV file {file.filename} must have 'Question' and 'Answer' columns."
|
||||||
|
},
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Normalize column names to lowercase
|
||||||
|
df.columns = df.columns.str.lower()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing file {file.filename}: {str(e)}")
|
print(f"Error processing file {file.filename}: {str(e)}")
|
||||||
return JSONResponse({"error": f"Error processing {file.filename}: {str(e)}"}, status_code=400)
|
return JSONResponse(
|
||||||
|
{"error": f"Error processing {file.filename}: {str(e)}"},
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
|
||||||
df["answer"] = df["answer"].apply(
|
# Parse JSON answers with better error handling
|
||||||
lambda x: json.loads(x.replace("\\\\", "\\\\\\")) if isinstance(x, str) else x
|
def safe_json_parse(x):
    """Parse one raw ``answer`` cell into a list of answer-option dicts.

    Non-string values are returned untouched. A string is decoded as JSON in
    up to two attempts:

    1. Replace ``\\"`` with ``"``, drop one pair of surrounding double quotes,
       then decode.
    2. If that raises ``json.JSONDecodeError``: drop the surrounding quotes
       first, replace ``\\"`` with ``"`` and ``\\\\`` with ``\\``, then decode.

    The result of either attempt goes through the same validation, so the
    caller only ever receives dicts that contain both ``"text"`` and
    ``"is_correct"``.

    Args:
        x: Raw cell value.

    Returns:
        ``x`` itself when it is not a string. Otherwise the list of valid
        answer dicts; an empty list means the cell was blank, could not be
        decoded, was not a JSON list, or held no valid option.
    """

    def strip_outer_quotes(text):
        # Remove exactly one pair of wrapping double quotes, if present.
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
        return text

    def keep_valid_options(parsed):
        # Shared by both attempts so a retry can never return unchecked items.
        if not isinstance(parsed, list):
            print(
                f"Warning: Expected list but got {type(parsed)} for value: {repr(x[:50])}"
            )
            return []

        valid_answers = []
        for item in parsed:
            if isinstance(item, dict) and "text" in item and "is_correct" in item:
                valid_answers.append(item)
            else:
                print(
                    f"Warning: Invalid answer format - missing 'text' or 'is_correct': {item}"
                )

        if not valid_answers:
            print(f"Warning: No valid answer options found in: {repr(x[:50])}")
        return valid_answers

    if not isinstance(x, str):
        return x

    stripped = x.strip()
    if not stripped:
        return []

    try:
        # First attempt: undo escaped quotes left over from CSV parsing.
        first_try = strip_outer_quotes(stripped.replace('\\"', '"'))
        return keep_valid_options(json.loads(first_try))
    except json.JSONDecodeError as e:
        print(f"JSON parsing error for value: {repr(x[:100])}")
        print(f"Error: {e}")

    try:
        # Second attempt: more aggressive cleaning, quotes stripped first and
        # doubled backslashes collapsed.
        second_try = (
            strip_outer_quotes(stripped).replace('\\"', '"').replace("\\\\", "\\")
        )
        return keep_valid_options(json.loads(second_try))
    except (ValueError, RecursionError) as retry_error:
        # Best effort only: report the failure and let the caller drop the row.
        print(f"Retry after cleaning also failed: {retry_error}")
        return []
|
||||||
|
|
||||||
|
print("Parsing JSON answers...")
|
||||||
|
df["answer"] = df["answer"].apply(safe_json_parse)
|
||||||
|
|
||||||
|
# Filter out rows where JSON parsing failed (empty lists)
|
||||||
|
original_count = len(df)
|
||||||
|
df = df[df["answer"].apply(lambda x: len(x) > 0)]
|
||||||
|
if len(df) < original_count:
|
||||||
|
print(
|
||||||
|
f"Warning: Filtered out {original_count - len(df)} rows due to JSON parsing errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Validate that we have valid answer structures
|
||||||
|
print(f"Sample parsed answer: {df.iloc[0]['answer'] if len(df) > 0 else 'N/A'}")
|
||||||
|
|
||||||
combined_df = pd.concat([combined_df, df], ignore_index=True)
|
combined_df = pd.concat([combined_df, df], ignore_index=True)
|
||||||
|
|
||||||
combined_df = pd.concat([combined_df] * quiz_settings.question_count_multiplier, ignore_index=True)
|
combined_df = pd.concat(
|
||||||
|
[combined_df] * quiz_settings.question_count_multiplier, ignore_index=True
|
||||||
|
)
|
||||||
|
|
||||||
if quiz_settings.randomise_order:
|
if quiz_settings.randomise_order:
|
||||||
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
|
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
|
||||||
@@ -132,6 +313,7 @@ async def upload_csv_with_settings(files: list[UploadFile] = File(...), settings
|
|||||||
|
|
||||||
return {"session_id": session_id, "message": "Quiz session started!"}
|
return {"session_id": session_id, "message": "Quiz session started!"}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/quiz-settings/")
|
@app.get("/quiz-settings/")
|
||||||
async def get_quiz_settings(session_id: str):
|
async def get_quiz_settings(session_id: str):
|
||||||
session = quiz_sessions.get(session_id)
|
session = quiz_sessions.get(session_id)
|
||||||
@@ -140,6 +322,7 @@ async def get_quiz_settings(session_id: str):
|
|||||||
|
|
||||||
return session["settings"]
|
return session["settings"]
|
||||||
|
|
||||||
|
|
||||||
@app.get("/next-question/")
|
@app.get("/next-question/")
|
||||||
async def get_next_question(session_id: str):
|
async def get_next_question(session_id: str):
|
||||||
session = quiz_sessions.get(session_id)
|
session = quiz_sessions.get(session_id)
|
||||||
@@ -156,7 +339,9 @@ async def get_next_question(session_id: str):
|
|||||||
possible_answers = [ans["text"] for ans in df.loc[question_index]["answer"]]
|
possible_answers = [ans["text"] for ans in df.loc[question_index]["answer"]]
|
||||||
if session["settings"]["shuffle_answers"]:
|
if session["settings"]["shuffle_answers"]:
|
||||||
random.shuffle(possible_answers)
|
random.shuffle(possible_answers)
|
||||||
multiple_choice = len([ans for ans in df.loc[question_index]["answer"] if ans["is_correct"]]) > 1
|
multiple_choice = (
|
||||||
|
len([ans for ans in df.loc[question_index]["answer"] if ans["is_correct"]]) > 1
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"question": question,
|
"question": question,
|
||||||
@@ -165,6 +350,7 @@ async def get_next_question(session_id: str):
|
|||||||
"multiple_choice": multiple_choice,
|
"multiple_choice": multiple_choice,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/submit-answer/")
|
@app.post("/submit-answer/")
|
||||||
async def submit_answer(submission: AnswerSubmission):
|
async def submit_answer(submission: AnswerSubmission):
|
||||||
if submission.session_id not in quiz_sessions:
|
if submission.session_id not in quiz_sessions:
|
||||||
@@ -173,7 +359,11 @@ async def submit_answer(submission: AnswerSubmission):
|
|||||||
correct_answers = get_correct_answers(submission.session_id)
|
correct_answers = get_correct_answers(submission.session_id)
|
||||||
is_correct = check_answers(submission.selected_answers, correct_answers)
|
is_correct = check_answers(submission.selected_answers, correct_answers)
|
||||||
update_stats(submission.session_id, is_correct)
|
update_stats(submission.session_id, is_correct)
|
||||||
return {"result": "Correct" if is_correct else "Incorrect", "correct_answers": correct_answers}
|
return {
|
||||||
|
"result": "Correct" if is_correct else "Incorrect",
|
||||||
|
"correct_answers": correct_answers,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/quiz-stats/")
|
@app.get("/quiz-stats/")
|
||||||
async def get_quiz_stats(session_id: str):
|
async def get_quiz_stats(session_id: str):
|
||||||
@@ -191,6 +381,7 @@ async def get_quiz_stats(session_id: str):
|
|||||||
"incorrect_answers": incorrect_count,
|
"incorrect_answers": incorrect_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/move-question-to-bottom/")
|
@app.post("/move-question-to-bottom/")
|
||||||
async def move_question_to_bottom(request: MoveQuestionRequest):
|
async def move_question_to_bottom(request: MoveQuestionRequest):
|
||||||
session = validate_session(request.session_id)
|
session = validate_session(request.session_id)
|
||||||
@@ -210,6 +401,7 @@ async def move_question_to_bottom(request: MoveQuestionRequest):
|
|||||||
session["data"] = df
|
session["data"] = df
|
||||||
return {"message": "Question moved to the bottom successfully."}
|
return {"message": "Question moved to the bottom successfully."}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/reset-session/")
|
@app.post("/reset-session/")
|
||||||
async def reset_session(session_reset_request: SessionResetRequest):
|
async def reset_session(session_reset_request: SessionResetRequest):
|
||||||
session = quiz_sessions.get(session_reset_request.session_id)
|
session = quiz_sessions.get(session_reset_request.session_id)
|
||||||
@@ -223,6 +415,7 @@ async def reset_session(session_reset_request: SessionResetRequest):
|
|||||||
|
|
||||||
return {"message": "Session reset successfully."}
|
return {"message": "Session reset successfully."}
|
||||||
|
|
||||||
|
|
||||||
# Application startup
|
# Application startup
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("Starting server on 0.0.0.0:8000...")
|
print("Starting server on 0.0.0.0:8000...")
|
||||||
|
|||||||
Reference in New Issue
Block a user