#!/usr/bin/env python3
"""
LLM Tournament Test Suite

This script tests solutions from all rounds and models on a given input file,
and collects metrics on the results.
"""

import os
import time
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import io
import re
from dateutil.parser import parse

class o3_miniRound0Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return o3_miniRound0Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # List of known delimiters (other than comma) that we will standardize to comma.
        additional_delims = [';', '\t', '|']
        cleaned_lines = []
        
        # Split the CSV data into lines and drop those that are completely empty.
        for line in csv_data.splitlines():
            stripped_line = line.strip()
            if not stripped_line:
                continue   # skip blank lines
            
            # Replace any occurrence of our additional delimiters with comma.
            new_line = stripped_line
            for d in additional_delims:
                new_line = new_line.replace(d, ',')
            
            # Fallback: if after our replacements there is no comma but the line seems to contain multiple fields,
            # check for two (or more) whitespace separation and rejoin by comma.
            if ',' not in new_line:
                # If there are at least two chunks separated by two or more spaces
                fields = re.split(r'\s{2,}', new_line)
                if len(fields) > 1:
                    new_line = ','.join(fields)
            cleaned_lines.append(new_line)
        
        # Join back into a single normalized CSV text.
        unified_csv = "\n".join(cleaned_lines)
        
        # Now use csv.reader to parse the (hopefully) delimited rows.
        # We use io.StringIO to wrap our text.
        reader = csv.reader(io.StringIO(unified_csv), delimiter=',')
        rows = list(reader)
        if not rows:
            return ""
        
        # Normalize the header row: trim and lowercase names, and replace whitespace with underscores.
        header = []
        for col in rows[0]:
            c = col.strip().lower()
            # replace internal spaces with underscore and remove any stray non-alnum characters (except underscores)
            c = re.sub(r'\s+', '_', c)
            c = re.sub(r'[^\w]', '', c)
            header.append(c)
        
        normalized_rows = [header]
        
        # Now process each non-header row.
        for row in rows[1:]:
            # If the entire row is empty (or consisting solely of empty/whitespace values) skip it.
            if all(x.strip() == "" for x in row):
                continue
            normalized_row = [o3_miniRound0Solution.normalize_value(field) for field in row]
            normalized_rows.append(normalized_row)
        
        # Write normalized data using csv.writer so that fields are properly escaped.
        out_io = io.StringIO()
        writer = csv.writer(out_io, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        for row in normalized_rows:
            writer.writerow(row)
        
        return out_io.getvalue()

    @staticmethod
    def remove_outer_quotes(s: str) -> str:
        """Strip leading and trailing whitespace and remove matching single or double quotes."""
        s = s.strip()
        if len(s) >= 2:
            # Check for matching double quotes or single quotes
            if (s[0] == s[-1]) and s[0] in ('"', "'"):
                return s[1:-1].strip()
        return s

    @staticmethod
    def try_normalize_date(s: str) -> str:
        """Try to parse a date and reformat to ISO YYYY-MM-DD.
           Returns the ISO string if successful, or None."""
        try:
            # Use fuzzy parsing to handle things like "May 3rd, 1992"
            dt = parse(s, fuzzy=True, dayfirst=False)
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return None

    @staticmethod
    def try_normalize_number(s: str) -> str:
        """Try to clean up a number string.
           It removes currency symbols and standardizes thousand and decimal separators.
           Returns the normalized number as a string if successful; otherwise None."""
        # Remove common currency symbols and spaces.
        cleaned = re.sub(r'[€$]', '', s).strip()
        # First attempt: Assume US/International style where comma is a thousand separator.
        try:
            # Remove commas then convert to float.
            num = float(cleaned.replace(',', ''))
            # We output in standard format.
            return f"{num}"
        except Exception:
            pass
        # Second attempt: Assume European format, e.g. "1.000,00"
        try:
            # Remove dots as thousand separators and replace comma with dot.
            cleaned2 = cleaned.replace('.', '').replace(',', '.')
            num = float(cleaned2)
            return f"{num}"
        except Exception:
            return None

    @staticmethod
    def normalize_value(val: str) -> str:
        v = o3_miniRound0Solution.remove_outer_quotes(val)
        v = v.strip()
        if v == "":
            return ""  # treat empty as missing
        
        lower_v = v.lower()

        # Handle booleans explicitly.
        if lower_v in ('yes', 'true'):
            return "true"
        if lower_v in ('no', 'false'):
            return "false"
        
        # Try to normalize date if possible.
        date_norm = o3_miniRound0Solution.try_normalize_date(v)
        if date_norm is not None:
            return date_norm
        
        # Try to normalize the number.
        # We only attempt this if v has digits in it.
        if re.search(r'\d', v):
            num_norm = o3_miniRound0Solution.try_normalize_number(v)
            if num_norm is not None:
                return num_norm
        
        # Otherwise, return the cleaned string.
        return v
from dateutil import parser as date_parser
import locale

class gpt4oRound0Solution:
    @staticmethod
    def solve(input_text):
        return gpt4oRound0Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        # Prepare to read the CSV data
        # Split the input data into lines
        lines = csv_data.strip().splitlines()
        
        # Detect the delimiter by checking the first few lines
        possible_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: 0 for d in possible_delimiters}

        for line in lines[:10]:  # Check the first few lines for delimiter detection
            for delim in possible_delimiters:
                delimiter_counts[delim] += line.count(delim)
        
        # Choose the most common delimiter
        delimiter = max(delimiter_counts, key=delimiter_counts.get)
        
        # Use csv.reader to handle the CSV parsing
        csv_reader = csv.reader(lines, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)
        data = list(csv_reader)
        
        # Normalize column names
        headers = data[0]
        normalized_headers = [re.sub(r'\s+', '_', h.strip().lower()) for h in headers]
        
        normalized_data = [normalized_headers]
        
        locale.setlocale(locale.LC_NUMERIC, '')  # Use user's preferred locale settings
        
        for row in data[1:]:
            if len(row) == 0 or all(not cell.strip() for cell in row):
                continue  # Skip empty rows
            
            normalized_row = []
            for cell in row:
                cell = cell.strip()
                
                # Handle missing values
                if cell == '' or cell.lower() in {'null', 'none'}:
                    normalized_row.append('')
                    continue
                
                # Handle numeric values
                try:
                    # Remove currency symbols and thousand separators, replace commas with dots for decimals
                    number = locale.atof(re.sub(r'[^\d,.-]', '', cell))
                    normalized_row.append(f"{number:.2f}")
                    continue
                except (ValueError, locale.Error):
                    pass
                
                # Handle dates
                try:
                    date = date_parser.parse(cell)
                    normalized_row.append(date.strftime('%Y-%m-%d'))
                    continue
                except (ValueError, OverflowError):
                    pass
                
                # If no special handling needed, keep the original value
                normalized_row.append(cell)
            
            normalized_data.append(normalized_row)
        
        # Write normalized data back to CSV format
        output = io.StringIO()
        csv_writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_ALL)
        csv_writer.writerows(normalized_data)
        
        return output.getvalue()
import chardet
import dateutil.parser
from datetime import datetime

class claude37Round0Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return claude37Round0Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM if present
        csv_data = csv_data.lstrip('\ufeff')
        
        # Normalize line endings
        csv_data = csv_data.replace('\r\n', '\n').replace('\r', '\n')
        
        # Skip completely empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect delimiter
        delimiter = claude37Round0Solution.detect_delimiter(lines)
        
        # Parse CSV with the detected delimiter
        reader = csv.reader(io.StringIO('\n'.join(lines)), delimiter=delimiter, quotechar='"')
        rows = list(reader)
        
        if not rows:
            return ""
        
        # Normalize header row
        header = [claude37Round0Solution.normalize_column_name(col.strip()) for col in rows[0]]
        
        # Process data rows
        cleaned_rows = [header]
        for row in rows[1:]:
            # Skip rows that don't match header length after attempting to fix
            adjusted_row = claude37Round0Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue
                
            # Clean each field
            cleaned_row = []
            for i, field in enumerate(adjusted_row):
                col_name = header[i] if i < len(header) else f"column_{i}"
                cleaned_field = claude37Round0Solution.clean_field(field, col_name)
                cleaned_row.append(cleaned_field)
            
            cleaned_rows.append(cleaned_row)
        
        # Write the cleaned data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(cleaned_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines):
        """Detect the most likely delimiter in the CSV data."""
        common_delimiters = [',', ';', '\t', '|']
        
        # Count occurrences of each delimiter in each line
        delimiter_counts = {d: [] for d in common_delimiters}
        
        for line in lines:
            # Skip empty lines
            if not line.strip():
                continue
                
            # Count delimiters outside of quoted sections
            for delimiter in common_delimiters:
                count = claude37Round0Solution.count_delimiters_outside_quotes(line, delimiter)
                delimiter_counts[delimiter].append(count)
        
        # Find the delimiter that has the most consistent non-zero count
        best_delimiter = ','  # Default to comma
        best_consistency = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
                
            # Filter out zeros
            non_zero_counts = [c for c in counts if c > 0]
            if not non_zero_counts:
                continue
                
            # Calculate consistency (how many lines have the same count)
            most_common_count = max(set(non_zero_counts), key=non_zero_counts.count)
            consistency = non_zero_counts.count(most_common_count) / len(non_zero_counts)
            
            if consistency > best_consistency:
                best_delimiter = delimiter
                best_consistency = consistency
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line, delimiter):
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        
        for i, char in enumerate(line):
            # Toggle quote state
            if char in ['"', "'"]:
                # Check if this quote is escaped
                if i > 0 and line[i-1] == '\\':
                    continue
                    
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            
            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1
                
        return count

    @staticmethod
    def normalize_column_name(name):
        """Normalize column name to lowercase with underscores."""
        # Replace non-alphanumeric chars with underscores
        name = re.sub(r'[^\w\s]', '_', name)
        # Replace spaces with underscores and convert to lowercase
        name = re.sub(r'\s+', '_', name).lower()
        # Remove consecutive underscores
        name = re.sub(r'_+', '_', name)
        # Remove leading/trailing underscores
        name = name.strip('_')
        return name

    @staticmethod
    def adjust_row_length(row, expected_length):
        """Adjust row to match expected length."""
        if len(row) == expected_length:
            return row
        
        # Try to combine fields that might have been incorrectly split
        if len(row) > expected_length:
            # Look for quotes that might indicate fields that should be combined
            new_row = []
            i = 0
            while i < len(row):
                field = row[i]
                # If we've already reached expected length, combine all remaining fields
                if len(new_row) == expected_length - 1:
                    new_row.append(' '.join(row[i:]))
                    break
                    
                # Check if this field starts with a quote but doesn't end with one
                if (field.startswith('"') and not field.endswith('"')) or \
                   (field.startswith("'") and not field.endswith("'")):
                    # Find the matching end quote
                    combined = field
                    j = i + 1
                    found_end = False
                    while j < len(row):
                        combined += " " + row[j]
                        if (field.startswith('"') and row[j].endswith('"')) or \
                           (field.startswith("'") and row[j].endswith("'")):
                            found_end = True
                            break
                        j += 1
                    
                    if found_end:
                        new_row.append(combined)
                        i = j + 1
                    else:
                        new_row.append(field)
                        i += 1
                else:
                    new_row.append(field)
                    i += 1
                    
            if len(new_row) == expected_length:
                return new_row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is still too long, truncate
        if len(row) > expected_length:
            return row[:expected_length]
        
        return row

    @staticmethod
    def clean_field(field, column_name):
        """Clean and normalize a field value based on content and column name."""
        # Remove leading/trailing quotes and whitespace
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
           (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round0Solution.looks_like_date_column(column_name):
            try:
                date_obj = claude37Round0Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except:
                pass  # If date parsing fails, continue with other cleaning
        
        # Try to detect and normalize numeric fields
        if claude37Round0Solution.looks_like_numeric(field):
            try:
                return claude37Round0Solution.format_number(field)
            except:
                pass  # If number parsing fails, return cleaned string
        
        return field

    @staticmethod
    def looks_like_date_column(column_name):
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str):
        """Parse a date string in various formats."""
        # Remove currency symbols and other non-date characters
        date_str = re.sub(r'[$€£]', '', date_str)
        
        # Handle special formats like "May 3rd, 1992"
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        
        try:
            return dateutil.parser.parse(date_str, fuzzy=True)
        except:
            return None

    @staticmethod
    def looks_like_numeric(field):
        """Check if field looks like it contains a number."""
        # Remove currency symbols, commas, spaces
        cleaned = re.sub(r'[$€£\s,]', '', field)
        # Replace European decimal separator
        cleaned = cleaned.replace(',', '.')
        
        # Check if it's a number
        return bool(re.match(r'^-?\d+(\.\d+)?$', cleaned))

    @staticmethod
    def format_number(number_str):
        """Format a number string to a standard format."""
        # Remove currency symbols and spaces
        cleaned = re.sub(r'[$€£\s]', '', number_str)
        
        # Handle European number format (1.234,56 -> 1234.56)
        if ',' in cleaned and '.' in cleaned:
            if cleaned.rindex(',') > cleaned.rindex('.'):
                # European format: 1.234,56
                cleaned = cleaned.replace('.', '')
                cleaned = cleaned.replace(',', '.')
            # else: US format: 1,234.56 - just remove commas
            else:
                cleaned = cleaned.replace(',', '')
        elif ',' in cleaned and '.' not in cleaned:
            # Could be European decimal or US thousand separator
            # Assume European if after last comma there are exactly 2 digits
            if re.search(r',\d{2}$', cleaned):
                cleaned = cleaned.replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
        
        # Parse and format the number
        try:
            num = float(cleaned)
            # If it's an integer, return it as such
            if num.is_integer():
                return str(int(num))
            else:
                # Format with up to 6 decimal places, removing trailing zeros
                return str(round(num, 6)).rstrip('0').rstrip('.') if '.' in str(round(num, 6)) else str(round(num, 6))
        except ValueError:
            return number_str  # Return original if parsing fails
from io import StringIO
from dateutil import parser
from collections import Counter

class mistral_largeRound0Solution:
    @staticmethod
    def detect_delimiter(csv_data):
        lines = csv_data.splitlines()
        delimiter_candidates = [',', ';', '\t', '|']
        delimiter_counts = Counter()
        for line in lines[:5]:  # Check the first 5 lines
            for delimiter in delimiter_candidates:
                delimiter_counts[delimiter] += line.count(delimiter)
        return delimiter_counts.most_common(1)[0][0]

    @staticmethod
    def normalize_quotes(field):
        if field.startswith(("'", '"')) and field.endswith(("'", '"')):
            return f'"{field[1:-1]}"'
        return f'"{field}"'

    @staticmethod
    def parse_date(date_str):
        try:
            return parser.parse(date_str).strftime('%Y-%m-%d')
        except (ValueError, OverflowError):
            return date_str

    @staticmethod
    def parse_number(number_str):
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        return number_str

    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        # Detect delimiter
        delimiter = mistral_largeRound0Solution.detect_delimiter(csv_data)

        # Normalize quotes and handle escaped quotes
        csv_data = re.sub(r"(?<!\\)'", '"', csv_data)
        csv_data = re.sub(r'(?<!\\)"', '"', csv_data)

        # Read CSV data
        reader = csv.reader(StringIO(csv_data), delimiter=delimiter)
        rows = list(reader)

        # Normalize column names
        header = [re.sub(r'\s+', '_', col.lower().strip()) for col in rows[0]]

        # Process rows
        normalized_rows = []
        for row in rows[1:]:
            if all(field.strip() == '' for field in row):
                continue
            normalized_row = []
            for field in row:
                field = field.strip()
                if field == '':
                    normalized_row.append('')
                else:
                    field = mistral_largeRound0Solution.normalize_quotes(field)
                    if re.match(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', field):
                        field = mistral_largeRound0Solution.parse_date(field)
                    elif re.match(r'\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})?', field):
                        field = mistral_largeRound0Solution.parse_number(field)
                    normalized_row.append(field)
            normalized_rows.append(normalized_row)

        # Write normalized CSV data
        output = StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def solve(input_text):
        return mistral_largeRound0Solution.normalize_csv(input_text)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


class o3_miniRound1Solution:
    @staticmethod
    def solve(input_text):
        return o3_miniRound1Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        The function addresses various common CSV issues:
          • Inconsistent (or mixed) delimiters by detecting the most common delimiter
          • Mixed quote styles – it removes outer single or double quotes
          • Inconsistent date formats – uses dateutil to parse and convert to ISO (YYYY-MM-DD)
          • Inconsistent numeric formats – converts numbers (US and European style) to standard numeric format
          • Trims extra whitespace; replaces missing/null values with empty strings
          • Normalizes header names to lowercase and underscores
          • Handles simple encoding issues and BOM

        Args:
            csv_data: String containing messy CSV data.
        
        Returns:
            A string with cleaned, normalized CSV data.
        """
        # --- 1. Handle encoding issues and BOM removal ---
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected['encoding'] if detected['encoding'] else 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        # Remove any BOM and normalize line endings to '\n'
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into nonempty lines (stripping leading/trailing whitespace)
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # --- 2. Detect the delimiter ---
        delimiter = o3_miniRound1Solution.detect_delimiter(lines)
        
        # --- 3. Parse CSV using the detected delimiter ---
        reader = csv.reader(io.StringIO("\n".join(lines)), delimiter=delimiter, skipinitialspace=True)
        rows = list(reader)
        if not rows:
            return ""
        
        # --- 4. Normalize header names (assume first row is header) ---
        header = [o3_miniRound1Solution.normalize_column_name(col) for col in rows[0]]
        normalized_rows = [header]
        
        # --- 5. Process each data row ---
        for row in rows[1:]:
            # Skip rows that are completely empty (after stripping)
            if all(not cell.strip() for cell in row):
                continue
            
            # If row field count is not the same as header, adjust:
            row = o3_miniRound1Solution.adjust_row_length(row, len(header))
            
            new_row = []
            for cell in row:
                cleaned = o3_miniRound1Solution.clean_field(cell)
                norm_val = o3_miniRound1Solution.normalize_value(cleaned)
                new_row.append(norm_val)
            normalized_rows.append(new_row)
        
        # --- 6. Write the normalized output to CSV format (always use comma delimiter) ---
        out_io = io.StringIO()
        # Using QUOTE_MINIMAL so that only fields needing quotes are quoted.
        writer = csv.writer(out_io, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        return out_io.getvalue()

    @staticmethod
    def detect_delimiter(lines):
        """
        Detect the most likely delimiter (from a set of common delimiters)
        by counting occurrences outside of quotes. 
        """
        common_delimiters = [',', ';', '\t', '|']
        # For each candidate, count occurrences outside of quoted parts in each line.
        delimiter_counts = {d: [] for d in common_delimiters}
        for line in lines:
            for delim in common_delimiters:
                count = o3_miniRound1Solution.count_delimiters_outside_quotes(line, delim)
                delimiter_counts[delim].append(count)
        # Prefer the delimiter with a consistent nonzero count across lines
        best_delim = ','
        best_score = 0
        for delim, counts in delimiter_counts.items():
            # Only consider lines that actually have nonzero counts
            non_zero = [c for c in counts if c > 0]
            if not non_zero:
                continue
            # Use consistency: most common count
            most_common = Counter(non_zero).most_common(1)[0][1]
            if most_common > best_score:
                best_score = most_common
                best_delim = delim
        return best_delim

    @staticmethod
    def count_delimiters_outside_quotes(line, delimiter):
        """
        Count instances of the delimiter outside of any quoted regions.
        Quotes may be single or double.
        """
        count = 0
        in_quotes = False
        quote_char = None
        for char in line:
            # Toggle the in_quotes flag if we encounter an unescaped quote
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        return count

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """Normalize a column name to lowercase, trim it, 
           replace spaces with underscores and remove extra non-alphanumeric characters."""
        name = name.strip().lower()
        name = re.sub(r'\s+', '_', name)
        name = re.sub(r'[^\w]', '', name)
        return name

    @staticmethod
    def adjust_row_length(row, expected_length):
        """
        Adjust row length to match expected header length.
        If row has too few values, pad with empty strings.
        If too many, try to combine extra splits if they are likely due to erroneous splits
        or else truncate.
        """
        if len(row) == expected_length:
            return row
        elif len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        else:
            # Attempt to combine fields if extra fields might be the result of splitting on a delimiter inside quoted text
            new_row = []
            i = 0
            while i < len(row) and len(new_row) < expected_length - 1:
                field = row[i]
                # If the field starts with a quote but does not end with one, combine with next fields.
                if (field.startswith('"') and not field.endswith('"')) or (field.startswith("'") and not field.endswith("'")):
                    combined = field
                    i += 1
                    while i < len(row) and not row[i].endswith(field[0]):
                        combined += " " + row[i]
                        i += 1
                    if i < len(row):
                        combined += " " + row[i]
                    new_row.append(combined)
                    i += 1
                else:
                    new_row.append(field)
                    i += 1
            # Append any remaining fields to the last column (if any)
            if i < len(row):
                new_row.append(" ".join(row[i:]))
            # Ensure correct length
            if len(new_row) < expected_length:
                new_row += [""] * (expected_length - len(new_row))
            elif len(new_row) > expected_length:
                new_row = new_row[:expected_length]
            return new_row

    @staticmethod
    def clean_field(field: str) -> str:
        """
        Remove extraneous whitespace and remove matching outer quotes 
        (both single and double), then return the cleaned field.
        """
        field = field.strip()
        if len(field) >= 2:
            if (field[0] == field[-1]) and field[0] in ('"', "'"):
                field = field[1:-1].strip()
        return field

    @staticmethod
    def normalize_value(value: str) -> str:
        """
        Normalize an individual field value:
          1. Replace missing/null-like values with empty string.
          2. Normalize booleans ("yes"/"true" => "true", "no"/"false" => "false").
          3. If it looks like a date, try to parse and convert to ISO YYYY-MM-DD.
          4. If it looks like a number, clean up currency and thousand separators.
          5. Otherwise, return the cleaned string.
        """
        # Handle missing/null values.
        if value == "" or value.lower() in {'null', 'none', 'na', 'n/a', '-'}:
            return ""
        
        # Normalize booleans
        low_val = value.lower()
        if low_val in {'yes', 'true', 'y', '1'}:
            return "true"
        if low_val in {'no', 'false', 'n', '0'}:
            return "false"
        
        # Attempt date normalization if there is any alphabetic character or common date separator.
        if re.search(r'[a-zA-Z]', value) or re.search(r'[\-/]', value):
            date_normalized = o3_miniRound1Solution.try_normalize_date(value)
            if date_normalized is not None:
                return date_normalized

        # Attempt to normalize numbers if the text contains digits.
        if re.search(r'\d', value):
            number_normalized = o3_miniRound1Solution.try_normalize_number(value)
            if number_normalized is not None:
                return number_normalized

        return value

    @staticmethod
    def try_normalize_date(text: str) -> str or None:
        """
        Try to parse a string as a date using dateutil's parser (with fuzzy matching)
        and if successful, return the date in ISO format (YYYY-MM-DD). Otherwise, return None.
        """
        try:
            # Remove ordinal suffixes like "3rd", "2nd", etc.
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', text)
            dt = date_parser.parse(cleaned, fuzzy=True)
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return None

    @staticmethod
    def try_normalize_number(text: str) -> str or None:
        """
        Try to convert a numeric string that may include various currency symbols,
        thousand separators, or European-style decimals into a standard numeric format.
        Returns the numeric string if successful, else None.
        """
        # Remove common currency symbols and extra whitespace.
        cleaned = re.sub(r'[\$€£]', '', text).strip()
        # First, try assuming commas are thousand separators (US style).
        try:
            num = float(cleaned.replace(',', ''))
            # Return as simple float string (or integer if exact)
            if num.is_integer():
                return str(int(num))
            return str(num)
        except Exception:
            pass

        # Second, try European convention: dots as thousand separators and comma as decimal separator.
        try:
            cleaned2 = cleaned.replace('.', '').replace(',', '.')
            num = float(cleaned2)
            if num.is_integer():
                return str(int(num))
            return str(num)
        except Exception:
            return None

class gpt4oRound1Solution:
    @staticmethod
    def solve(input_text):
        return gpt4oRound1Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Detect and decode character encoding issues
        if isinstance(csv_data, bytes):
            csv_data = csv_data.decode('utf-8', errors='replace')
            
        # Remove BOM if present
        csv_data = csv_data.lstrip('\ufeff')
        
        # Normalize line endings
        csv_data = csv_data.replace('\r\n', '\n').replace('\r', '\n')
        
        # Remove completely empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect delimiter
        delimiter = gpt4oRound1Solution.detect_delimiter(lines)
        
        # Parse CSV with the detected delimiter
        reader = csv.reader(io.StringIO('\n'.join(lines)), delimiter=delimiter, quotechar='"')
        rows = list(reader)
        if not rows:
            return ""
        
        # Normalize header row
        headers = [gpt4oRound1Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        cleaned_rows = [headers]
        for row in rows[1:]:
            # Skip rows that are completely empty
            if all(not cell.strip() for cell in row):
                continue
            
            # Ensure row length matches header length
            row = gpt4oRound1Solution.adjust_row_length(row, len(headers))
            
            cleaned_row = [gpt4oRound1Solution.clean_field(cell, headers[i]) for i, cell in enumerate(row)]
            cleaned_rows.append(cleaned_row)
        
        # Write the cleaned data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(cleaned_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines):
        """Detect the most likely delimiter in the CSV data."""
        delimiter_candidates = [',', ';', '\t', '|']
        delimiter_counts = Counter()

        for line in lines[:5]:  # Analyze the first few lines
            for delimiter in delimiter_candidates:
                delimiter_counts[delimiter] += line.count(delimiter)
        
        # Return the most common delimiter
        return delimiter_counts.most_common(1)[0][0]

    @staticmethod
    def normalize_column_name(name):
        """Normalize column name to lowercase with underscores."""
        name = re.sub(r'[^\w\s]', '_', name)
        name = re.sub(r'\s+', '_', name).lower()
        return name.strip('_')

    @staticmethod
    def adjust_row_length(row, expected_length):
        """Adjust row to match expected length."""
        if len(row) < expected_length:
            return row + [''] * (expected_length - len(row))
        elif len(row) > expected_length:
            return row[:expected_length]
        return row

    @staticmethod
    def clean_field(field, column_name):
        """Clean and normalize a field value based on content and column name."""
        field = field.strip().strip('"').strip("'")
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if gpt4oRound1Solution.looks_like_date_column(column_name):
            try:
                date_obj = date_parser.parse(field, fuzzy=True)
                return date_obj.strftime('%Y-%m-%d')
            except:
                pass

        # Try to detect and normalize numeric fields
        if gpt4oRound1Solution.looks_like_numeric(field):
            return gpt4oRound1Solution.parse_number(field)
        
        return field

    @staticmethod
    def looks_like_date_column(column_name):
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_numeric(field):
        """Check if field looks like it contains a number."""
        cleaned = re.sub(r'[^\d.,-]', '', field)
        return bool(re.match(r'^[+-]?\d+([.,]\d+)*$', cleaned))

    @staticmethod
    def parse_number(number_str):
        """Normalize number format."""
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            number_str = number_str.replace(',', '.')
        try:
            num = float(number_str)
            return f"{num:.2f}".rstrip('0').rstrip('.') if '.' in str(num) else str(int(num))
        except ValueError:
            return number_str
from typing import Optional

class claude37Round1Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return claude37Round1Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect and standardize delimiters
        delimiter = claude37Round1Solution.detect_delimiter(lines)
        standardized_lines = claude37Round1Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV with detected delimiter
        rows = claude37Round1Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""
        
        # Normalize header row
        header = [claude37Round1Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header
            adjusted_row = claude37Round1Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue
            
            # Clean each field
            normalized_row = [
                claude37Round1Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]
            
            normalized_rows.append(normalized_row)
        
        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}
        
        for line in lines[:min(10, len(lines))]:  # Check first 10 lines or fewer
            for delimiter in common_delimiters:
                count = claude37Round1Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent non-zero delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Calculate consistency (how many lines have the same count)
            count_frequencies = Counter(counts)
            most_common_count, frequency = count_frequencies.most_common(1)[0]
            consistency_score = frequency * most_common_count  # Weight by both frequency and count
            
            if consistency_score > best_score:
                best_delimiter = delimiter
                best_score = consistency_score
        
        # Special case: if no clear delimiter is found, check for multiple spaces
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False
        
        for i, char in enumerate(line):
            # Handle escape sequences
            if escaped:
                escaped = False
                continue
            
            if char == '\\':
                escaped = True
                continue
                
            # Toggle quote state
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            
            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1
                
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use the primary delimiter.
        
        Args:
            lines: List of CSV data lines
            primary_delimiter: The delimiter to standardize to
            
        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []
        
        for line in lines:
            # Handle space-delimited lines
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line)
                standardized_lines.append(','.join(fields))
                continue
            
            # For other delimiters, process quotes properly
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char
                    
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling mixed quote styles and other issues.
        
        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The primary delimiter used in the data
            
        Returns:
            List of parsed CSV rows
        """
        # Join lines back into a single string
        csv_text = '\n'.join(lines)
        
        # Use the correct delimiter for parsing
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter
        
        try:
            # Try parsing with csv module
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = list(reader)
            
            # Post-process to clean up quotes and whitespace
            clean_rows = []
            for row in rows:
                clean_row = []
                for field in row:
                    field = field.strip()
                    # Remove matching outer quotes if present
                    if (field.startswith('"') and field.endswith('"')) or \
                       (field.startswith("'") and field.endswith("'")):
                        field = field[1:-1].strip()
                    clean_row.append(field)
                clean_rows.append(clean_row)
            
            return clean_rows
        except Exception:
            # Fallback: manual parsing
            rows = []
            for line in lines:
                fields = []
                current_field = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current_field += char
                    elif char == actual_delimiter and not in_quotes:
                        fields.append(current_field.strip())
                        current_field = ""
                    else:
                        current_field += char
                        
                fields.append(current_field.strip())
                rows.append(fields)
                
            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            column: The column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove outer quotes if present
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or \
           (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()
        
        # Replace non-alphanumeric with underscores
        normalized = re.sub(r'[^\w\s]', '_', column)
        # Replace whitespace with underscores and convert to lowercase
        normalized = re.sub(r'\s+', '_', normalized).lower()
        # Remove consecutive underscores
        normalized = re.sub(r'_+', '_', normalized)
        # Remove leading/trailing underscores
        normalized = normalized.strip('_')
        
        return normalized or "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length.
        
        Args:
            row: The row to adjust
            expected_length: The expected number of fields
            
        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, try to combine fields that might have been incorrectly split
        new_row = []
        i = 0
        while i < len(row):
            field = row[i]
            
            # If we've already reached expected length, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(' '.join(row[i:]))
                break
                
            # Check if this field starts with a quote but doesn't end with one
            if (field.startswith('"') and not field.endswith('"')) or \
               (field.startswith("'") and not field.endswith("'")):
                # Find the matching end quote
                combined = field
                j = i + 1
                found_end = False
                quote_char = field[0]
                
                while j < len(row):
                    combined += "," + row[j]  # Add back the comma that was removed during parsing
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1
                
                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1
        
        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]
        
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize field value based on content and column name.
        
        Args:
            field: The field value to clean
            column_name: The name of the column this field belongs to
            
        Returns:
            Cleaned field value
        """
        # Trim whitespace and remove outer quotes
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
           (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round1Solution.looks_like_date(field) or claude37Round1Solution.is_date_column(column_name):
            try:
                date_obj = claude37Round1Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except Exception:
                pass  # If date parsing fails, continue with other cleaning
        
        # Try to detect and normalize numeric fields
        if claude37Round1Solution.looks_like_numeric(field):
            try:
                return claude37Round1Solution.format_number(field)
            except Exception:
                pass  # If number parsing fails, return cleaned string
        
        return field

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        # Common date patterns
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str) -> Optional[Any]:
        """
        Parse a date string in various formats.
        
        Args:
            date_str: The date string to parse
            
        Returns:
            Parsed date object or None if parsing fails
        """
        # Remove currency symbols and other non-date characters
        date_str = re.sub(r'[$€£]', '', date_str)
        
        # Handle special formats like "May 3rd, 1992"
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        
        try:
            return parser.parse(date_str, fuzzy=True)
        except Exception:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """
        Check if field looks like it contains a number.
        
        Args:
            field: The field to check
            
        Returns:
            True if field looks like a number, False otherwise
        """
        # Remove currency symbols, commas, spaces
        cleaned = re.sub(r'[$€£¥]', '', field)
        cleaned = re.sub(r'[,\s]', '', cleaned)
        
        # Check for numeric patterns
        return bool(re.match(r'^-?\d*\.?\d+%?$', cleaned))

    @staticmethod
    def format_number(field: str) -> str:
        """
        Format a numeric field consistently.
        
        Args:
            field: The field to format
            
        Returns:
            Formatted number as string
        """
        # Remove currency symbols, commas, spaces
        cleaned = re.sub(r'[$€£¥]', '', field)
        cleaned = re.sub(r'[,\s]', '', cleaned)
        
        # Handle percentages
        is_percentage = cleaned.endswith('%')
        if is_percentage:
            cleaned = cleaned[:-1]
        
        # Parse as float
        try:
            num = float(cleaned)
            
            # Convert back to percentage if needed
            if is_percentage:
                return f"{num:.2f}%"
                
            # Handle integers vs decimals
            if num.is_integer():
                return str(int(num))
            else:
                # Format with up to 6 decimal places, removing trailing zeros
                formatted = f"{num:.6f}".rstrip('0').rstrip('.')
                return formatted
        except ValueError:
            return field

class mistral_largeRound1Solution:
    @staticmethod
    def detect_delimiter(csv_data):
        lines = csv_data.splitlines()
        delimiter_candidates = [',', ';', '\t', '|']
        delimiter_counts = Counter()
        for line in lines[:5]:  # Check the first 5 lines
            for delimiter in delimiter_candidates:
                delimiter_counts[delimiter] += line.count(delimiter)
        return delimiter_counts.most_common(1)[0][0]

    @staticmethod
    def normalize_quotes(field):
        if field.startswith(("'", '"')) and field.endswith(("'", '"')):
            return f'"{field[1:-1]}"'
        return f'"{field}"'

    @staticmethod
    def parse_date(date_str):
        try:
            return parser.parse(date_str).strftime('%Y-%m-%d')
        except (ValueError, OverflowError):
            return date_str

    @staticmethod
    def parse_number(number_str):
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        return number_str

    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')

        # Remove BOM if present
        csv_data = csv_data.lstrip('\ufeff')

        # Normalize line endings
        csv_data = csv_data.replace('\r\n', '\n').replace('\r', '\n')

        # Detect delimiter
        delimiter = mistral_largeRound1Solution.detect_delimiter(csv_data)

        # Normalize quotes and handle escaped quotes
        csv_data = re.sub(r"(?<!\\)'", '"', csv_data)
        csv_data = re.sub(r'(?<!\\)"', '"', csv_data)

        # Read CSV data
        reader = csv.reader(io.StringIO(csv_data), delimiter=delimiter)
        rows = list(reader)

        # Normalize column names
        header = [re.sub(r'\s+', '_', col.lower().strip()) for col in rows[0]]

        # Process rows
        normalized_rows = []
        for row in rows[1:]:
            if all(field.strip() == '' for field in row):
                continue
            normalized_row = []
            for field in row:
                field = field.strip()
                if field == '':
                    normalized_row.append('')
                else:
                    field = mistral_largeRound1Solution.normalize_quotes(field)
                    if re.match(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', field):
                        field = mistral_largeRound1Solution.parse_date(field)
                    elif re.match(r'\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})?', field):
                        field = mistral_largeRound1Solution.parse_number(field)
                    normalized_row.append(field)
            normalized_rows.append(normalized_row)

        # Write normalized CSV data
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def solve(input_text):
        return mistral_largeRound1Solution.normalize_csv(input_text)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


class o3_miniRound2Solution:
    @staticmethod
    def solve(input_text):
        return o3_miniRound2Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        This function handles:
          - Character encoding issues (including BOM removal)
          - Inconsistent line endings
          - Automatic detection of delimiter (among comma, semicolon, tab, pipe)
          - Mixed quote styles (removing outer quotes)
          - Normalizing header names (lowercase with underscores)
          - Adjusting rows if the field count is off
          - Cleaning each field:
              • Trimming extra whitespace/quotes
              • Converting booleans (e.g. "yes"/"no")
              • Converting dates (using dateutil's fuzzy parser into ISO YYYY-MM-DD)
              • Converting numbers (handling US vs European styles)
              • Returning empty string for missing/null-like values
        Returns:
            Clean CSV data as a string with comma-separated values.
        """
        # --- 1. Fix encoding, BOM, and line endings ---
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # --- 2. Split into nonempty lines ---
        lines = [line.strip() for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # --- 3. Detect delimiter among common candidates (',', ';', '\t', '|') ---
        delimiter = o3_miniRound2Solution.detect_delimiter(lines)
        
        # --- 4. Parse CSV using the detected delimiter ---
        # Use csv.reader to allow proper handling of quotes (our cleaning will remove extraneous quotes later)
        reader = csv.reader(io.StringIO("\n".join(lines)), delimiter=delimiter, skipinitialspace=True)
        rows = list(reader)
        if not rows:
            return ""
        
        # --- 5. Normalize header row ---
        header = [o3_miniRound2Solution.normalize_column_name(col) for col in rows[0]]
        
        # --- 6. Process each row (skip fully empty rows, adjust row length) ---
        cleaned_rows = [header]  # first row is header
        for row in rows[1:]:
            # Skip a row if all fields are blank after stripping
            if all(not cell.strip() for cell in row):
                continue
            
            row = o3_miniRound2Solution.adjust_row_length(row, len(header))
            
            # Clean each field based on its content (and if needed, by its column name)
            new_row = []
            for idx, field in enumerate(row):
                new_field = o3_miniRound2Solution.clean_field(field)
                new_field = o3_miniRound2Solution.normalize_value(new_field, header[idx] if idx < len(header) else f'column_{idx}')
                new_row.append(new_field)
            cleaned_rows.append(new_row)
        
        # --- 7. Write normalized rows back to CSV (use comma as delimiter) ---
        output = io.StringIO()
        writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(cleaned_rows)
        
        return output.getvalue()


    # ---------------- Helper Functions ----------------

    @staticmethod
    def detect_delimiter(lines):
        """
        Detect the most likely delimiter by counting occurrences outside quoted regions.
        Considers common delimiters: comma, semicolon, tab, and pipe.
        """
        candidates = [',', ';', '\t', '|']
        delim_counts = {d: 0 for d in candidates}
        for line in lines[:5]:  # analyze first few lines
            for d in candidates:
                delim_counts[d] += o3_miniRound2Solution.count_outside_quotes(line, d)
        # Pick the delimiter with the highest total count
        return max(delim_counts, key=delim_counts.get) if max(delim_counts.values()) > 0 else ','

    @staticmethod
    def count_outside_quotes(line, delimiter):
        """
        Count occurrences of a delimiter outside quoted segments.
        Quotes considered are single and double.
        """
        count = 0
        in_quotes = False
        quote_char = None
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        return count

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize a header column name to lowercase with underscores.
        Removes punctuation (except underscores) and extra whitespace.
        """
        name = name.strip().lower()
        # Replace non-word characters with underscore and reduce multiple underscores
        name = re.sub(r'[^\w\s]', '_', name)
        name = re.sub(r'\s+', '_', name)
        name = re.sub(r'_+', '_', name)
        return name.strip('_')

    @staticmethod
    def adjust_row_length(row, expected_length):
        """
        Change the row to have exactly expected_length fields.
          - If too few, pad with empty strings.
          - If too many, combine extra fields onto the last column.
        """
        if len(row) == expected_length:
            return row
        if len(row) < expected_length:
            return row + [''] * (expected_length - len(row))
        # When too many fields, combine extras into the last field.
        new_row = row[:expected_length-1]
        new_row.append(" ".join(row[expected_length-1:]))
        return new_row

    @staticmethod
    def clean_field(field: str) -> str:
        """
        Clean extraneous whitespace and remove matching outer quotes (both single and double).
        """
        field = field.strip()
        if len(field) >= 2 and field[0] == field[-1] and field[0] in ('"', "'"):
            field = field[1:-1].strip()
        return field

    @staticmethod
    def normalize_value(value: str, column_name: str) -> str:
        """
        Normalize an individual cell's value.
          • If blank or null-like, return empty string.
          • Convert booleans to standardized lower-case "true"/"false".
          • If the field looks like a date (or belongs to a date column), try to convert to ISO (YYYY-MM-DD).
          • If it looks like a number, remove currency symbols and thousand separators and produce a standard number.
          • Otherwise, return the cleaned string.
        """
        # Handle missing/null values
        if not value or value.lower() in {"null", "none", "na", "n/a", "-"}:
            return ""
        
        # Normalize boolean values
        low = value.lower()
        if low in {"yes", "true", "y", "1"}:
            return "true"
        if low in {"no", "false", "n", "0"}:
            return "false"
        
        # Try date normalization if value contains alphabetic characters or common date delimiters,
        # or if the column name implies a date (e.g., contains "date", "birth", "updated")
        if (re.search(r'[A-Za-z]', value) or re.search(r'[\-/]', value)) or o3_miniRound2Solution.is_date_column(column_name):
            date_norm = o3_miniRound2Solution.try_normalize_date(value)
            if date_norm is not None:
                return date_norm
        
        # If the field contains digits, attempt number normalization.
        if re.search(r'\d', value):
            num_norm = o3_miniRound2Solution.try_normalize_number(value)
            if num_norm is not None:
                return num_norm
        
        # Otherwise, return the value as-is.
        return value

    @staticmethod
    def is_date_column(col: str) -> bool:
        """Heuristic to see if a column name suggests date values."""
        return any(ind in col.lower() for ind in ['date', 'day', 'month', 'year', 'time', 'birth', 'updated', 'created'])

    @staticmethod
    def try_normalize_date(text: str):
        """
        Attempt to parse text as a date (using fuzzy matching).
        Removes ordinal suffixes (e.g. "3rd" becomes "3") before parsing.
        Returns date formatted as YYYY-MM-DD if successful; otherwise, returns None.
        """
        try:
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', text)
            dt = date_parser.parse(cleaned, fuzzy=True)
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return None

    @staticmethod
    def try_normalize_number(text: str):
        """
        Attempt to normalize a number:
          - Remove common currency symbols [$€£] and whitespace.
          - Handle both US style numbers (with comma as thousand separators)
            and European style (dot as thousand separator and comma as decimal separator).
        Returns the standard number string if parsed successfully; otherwise, None.
        """
        cleaned = re.sub(r'[\s$€£]', '', text)
        # If both comma and dot exist, decide by the rightmost symbol
        if ',' in cleaned and '.' in cleaned:
            if cleaned.rfind(',') > cleaned.rfind('.'):
                # Likely European style: thousand separator is dot, decimal is comma.
                cleaned = cleaned.replace('.', '').replace(',', '.')
            else:
                # Likely US style: remove commas.
                cleaned = cleaned.replace(',', '')
        elif ',' in cleaned and '.' not in cleaned:
            # Ambiguous: if comma appears and if there is only one occurrence at the appropriate position,
            # try European-style conversion.
            if cleaned.count(',') == 1 and len(cleaned.split(',')[-1]) in {1,2}:
                cleaned = cleaned.replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
                
        try:
            num = float(cleaned)
            # Return as integer if no fractional part, otherwise as float string
            if num.is_integer():
                return str(int(num))
            return str(num)
        except Exception:
            return None
from typing import List, Optional

class gpt4oRound2Solution:
    @staticmethod
    def solve(input_text):
        return gpt4oRound2Solution.normalize_csv(input_text)

    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect delimiter and normalize lines
        delimiter = gpt4oRound2Solution.detect_delimiter(lines)
        standardized_lines = gpt4oRound2Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV with detected delimiter
        rows = gpt4oRound2Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""
        
        # Normalize header
        header = [gpt4oRound2Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            if not row or all(not cell.strip() for cell in row):
                continue
            adjusted_row = gpt4oRound2Solution.adjust_row_length(row, len(header))
            normalized_row = [gpt4oRound2Solution.clean_field(cell, header[i] if i < len(header) else f"column_{i}") for i, cell in enumerate(adjusted_row)]
            normalized_rows.append(normalized_row)
        
        # Write to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: [] for d in common_delimiters}
        
        for line in lines[:10]:
            for delim in common_delimiters:
                count = gpt4oRound2Solution.count_delimiters_outside_quotes(line, delim)
                delimiter_counts[delim].append(count)
        
        best_delimiter = max(delimiter_counts, key=lambda d: sum(delimiter_counts[d]))
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        standardized_lines = []
        
        for line in lines:
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        csv_text = '\n'.join(lines)
        try:
            reader = csv.reader(io.StringIO(csv_text), delimiter=',')
            return [row for row in reader]
        except Exception:
            return []

    @staticmethod
    def normalize_column_name(column: str) -> str:
        column = column.strip().lower()
        column = re.sub(r'[^\w\s]', '_', column)
        column = re.sub(r'\s+', '_', column)
        return column.strip('_')

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        elif len(row) > expected_length:
            return row[:expected_length]
        return row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        field = field.strip()
        if len(field) >= 2 and ((field.startswith('"') and field.endswith('"')) or (field.startswith("'") and field.endswith("'"))):
            field = field[1:-1].strip()
        
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        if gpt4oRound2Solution.looks_like_date(field) or gpt4oRound2Solution.is_date_column(column_name):
            date_obj = gpt4oRound2Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        if gpt4oRound2Solution.looks_like_numeric(field):
            return gpt4oRound2Solution.format_number(field)
        
        return field

    @staticmethod
    def looks_like_date(field: str) -> bool:
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            r'[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{2,4}',
            r'\d{1,2}\s+[A-Za-z]{3,9},?\s+\d{2,4}'
        ]
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str) -> Optional[date_parser.ParserError]:
        try:
            return date_parser.parse(date_str, fuzzy=True)
        except (date_parser.ParserError, ValueError):
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        cleaned = re.sub(r'[^\d.,-]', '', field)
        return bool(re.match(r'^[+-]?\d+([.,]\d+)*$', cleaned))

    @staticmethod
    def format_number(number_str: str) -> str:
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            number_str = number_str.replace(',', '.')
        try:
            num = float(number_str)
            return f"{num:.2f}".rstrip('0').rstrip('.') if '.' in str(num) else str(int(num))
        except ValueError:
            return number_str
from dateutil import parser as date_parser
from typing import List, Optional, Any

class claude37Round2Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return claude37Round2Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # --- 1. Handle encoding issues ---
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected['encoding'] if detected['encoding'] else 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        
        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into non-empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # --- 2. Detect delimiter ---
        delimiter = claude37Round2Solution.detect_delimiter(lines)
        
        # --- 3. Parse CSV data ---
        rows = claude37Round2Solution.parse_csv_with_delimiter(lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # --- 4. Normalize header ---
        header = [claude37Round2Solution.normalize_column_name(col) for col in rows[0]]
        
        # --- 5. Process data rows ---
        cleaned_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header length
            row = claude37Round2Solution.adjust_row_length(row, len(header))
            
            # Clean each field based on content and column name
            cleaned_row = []
            for i, field in enumerate(row):
                column_name = header[i] if i < len(header) else ""
                cleaned_row.append(claude37Round2Solution.clean_field(field, column_name))
            
            cleaned_rows.append(cleaned_row)
        
        # --- 6. Write normalized data back to CSV ---
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(cleaned_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: [] for d in common_delimiters}
        
        # Analyze first few lines
        for line in lines[:min(10, len(lines))]:
            for delimiter in common_delimiters:
                # Count delimiters outside quotes
                count = claude37Round2Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Score based on consistency and frequency
            counter = Counter(counts)
            most_common, frequency = counter.most_common(1)[0]
            score = frequency * most_common
            
            if score > best_score:
                best_score = score
                best_delimiter = delimiter
        
        # Special case: check for multiple spaces as delimiter
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that appear outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def parse_csv_with_delimiter(lines: List[str], delimiter: str) -> List[List[str]]:
        """
        Parse CSV data using the detected delimiter.
        
        Args:
            lines: List of CSV data lines
            delimiter: The detected delimiter
            
        Returns:
            List of parsed rows
        """
        # Handle space delimiter specially
        if delimiter == r'\s+':
            rows = []
            for line in lines:
                # Split by multiple spaces while preserving quoted content
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        current += char
                    elif char.isspace() and not in_quotes and current:
                        if re.match(r'\s{2,}', char):
                            fields.append(current)
                            current = ""
                        else:
                            current += char
                    else:
                        current += char
                
                if current:
                    fields.append(current)
                rows.append(fields)
            return rows
        
        # For standard delimiters, use csv module
        try:
            csv_text = '\n'.join(lines)
            reader = csv.reader(io.StringIO(csv_text), delimiter=delimiter)
            return list(reader)
        except Exception:
            # Fallback to manual parsing if csv module fails
            rows = []
            for line in lines:
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current += char
                    elif char == delimiter and not in_quotes:
                        fields.append(current)
                        current = ""
                    else:
                        current += char
                
                fields.append(current)
                rows.append(fields)
            
            return rows

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            name: Column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove quotes and extra whitespace
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        
        # Convert to lowercase and replace spaces with underscores
        name = re.sub(r'\s+', '_', name.lower())
        
        # Replace non-alphanumeric characters with underscores
        name = re.sub(r'[^\w]', '_', name)
        
        # Remove consecutive underscores
        name = re.sub(r'_+', '_', name)
        
        # Remove leading/trailing underscores
        name = name.strip('_')
        
        return name if name else "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        """
        Adjust row to match expected length.
        
        Args:
            row: Row to adjust
            expected_length: Expected number of fields
            
        Returns:
            Adjusted row
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, try to intelligently combine fields
        new_row = []
        i = 0
        
        while i < len(row):
            # If we've reached our target length minus 1, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(" ".join(row[i:]))
                break
            
            field = row[i]
            
            # Check if this field starts with a quote but doesn't end with one
            if ((field.startswith('"') and not field.endswith('"')) or 
                (field.startswith("'") and not field.endswith("'"))):
                # Find matching end quote
                combined = field
                j = i + 1
                quote_char = field[0]
                found_end = False
                
                while j < len(row):
                    combined += "," + row[j]  # Add back the delimiter
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1
                
                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1
        
        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]
        
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str = "") -> str:
        """
        Clean and normalize a field value.
        
        Args:
            field: Field value to clean
            column_name: Name of the column (for type inference)
            
        Returns:
            Cleaned field value
        """
        # Remove outer quotes and trim whitespace
        field = field.strip()
        if len(field) >= 2:
            if (field[0] == field[-1]) and field[0] in ['"', "'"]:
                field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round2Solution.looks_like_date_column(column_name) or claude37Round2Solution.looks_like_date(field):
            try:
                date_obj = claude37Round2Solution.normalize_date(field)
                if date_obj:
                    return date_obj
            except Exception:
                pass  # Fall through to other normalizations if date parsing fails
        
        # Try to detect and normalize numeric fields
        if claude37Round2Solution.looks_like_numeric(field):
            try:
                return claude37Round2Solution.normalize_number(field)
            except Exception:
                pass  # Return original if number parsing fails
        
        return field

    @staticmethod
    def looks_like_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        # Common date patterns
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """
        Normalize date to ISO format (YYYY-MM-DD).
        
        Args:
            date_str: Date string to normalize
            
        Returns:
            Normalized date string
        """
        # Remove currency symbols and other non-date characters
        date_str = re.sub(r'[$€£]', '', date_str)
        
        # Handle ordinal suffixes (1st, 2nd, 3rd, 4th)
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        
        try:
            date_obj = date_parser.parse(date_str, fuzzy=True)
            return date_obj.strftime('%Y-%m-%d')
        except Exception:
            return date_str  # Return original if parsing fails

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if field looks like it contains a number."""
        # Remove currency symbols, commas, dots, spaces
        cleaned = re.sub(r'[$€£\s,.]', '', field)
        # Check if the remaining string is a number
        return bool(re.match(r'^[+-]?\d+$', cleaned))

    @staticmethod
    def normalize_number(number_str: str) -> str:
        """
        Normalize numeric format.
        
        Args:
            number_str: Number string to normalize
            
        Returns:
            Normalized number string
        """
        # Remove currency symbols and whitespace
        number_str = re.sub(r'[$€£\s]', '', number_str)
        
        # Handle different number formats
        if ',' in number_str and '.' in number_str:
            # Determine which is the decimal separator
            if number_str.rindex(',') > number_str.rindex('.'):
                # European format: 1.234,56
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                # US format: 1,234.56
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            # Could be either 1,234 (US) or 1,23 (European)
            parts = number_str.split(',')
            if len(parts) == 2 and len(parts[1]) <= 2:
                # Likely European decimal separator
                number_str = number_str.replace(',', '.')
            else:
                # Likely US thousands separator
                number_str = number_str.replace(',', '')
        
        try:
            # Convert to float and format appropriately
            num = float(number_str)
            if num.is_integer():
                return str(int(num))
            else:
                # Return with appropriate decimal places
                return str(num)
        except ValueError:
            return number_str  # Return original if conversion fails
from typing import List, Optional, Any

class mistral_largeRound2Solution:
    @staticmethod
    def solve(input_text):
        return mistral_largeRound2Solution.normalize_csv(input_text)
        
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.

        Args:
            csv_data: String containing messy CSV data

        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')

        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

        # Split into lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""

        # Detect and standardize delimiters
        delimiter = mistral_largeRound2Solution.detect_delimiter(lines)
        standardized_lines = mistral_largeRound2Solution.standardize_delimiters(lines, delimiter)

        # Parse CSV with detected delimiter
        rows = mistral_largeRound2Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""

        # Normalize header row
        header = [mistral_largeRound2Solution.normalize_column_name(col) for col in rows[0]]

        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue

            # Adjust row length to match header
            adjusted_row = mistral_largeRound2Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue

            # Clean each field
            normalized_row = [
                mistral_largeRound2Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]

            normalized_rows.append(normalized_row)

        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.

        Args:
            lines: List of CSV data lines

        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}

        for line in lines[:min(10, len(lines))]:  # Check first 10 lines or fewer
            for delimiter in common_delimiters:
                count = mistral_largeRound2Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)

        # Find most consistent non-zero delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0

        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue

            # Calculate consistency (how many lines have the same count)
            count_frequencies = Counter(counts)
            most_common_count, frequency = count_frequencies.most_common(1)[0]
            consistency_score = frequency * most_common_count  # Weight by both frequency and count

            if consistency_score > best_score:
                best_delimiter = delimiter
                best_score = consistency_score

        # Special case: if no clear delimiter is found, check for multiple spaces
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'

        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False

        for i, char in enumerate(line):
            # Handle escape sequences
            if escaped:
                escaped = False
                continue

            if char == '\\':
                escaped = True
                continue

            # Toggle quote state
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None

            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1

        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use the primary delimiter.

        Args:
            lines: List of CSV data lines
            primary_delimiter: The delimiter to standardize to

        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []

        for line in lines:
            # Handle space-delimited lines
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line)
                standardized_lines.append(','.join(fields))
                continue

            # For other delimiters, process quotes properly
            new_line = ""
            in_quotes = False
            quote_char = None

            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char

            standardized_lines.append(new_line)

        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling mixed quote styles and other issues.

        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The primary delimiter used in the data

        Returns:
            List of parsed CSV rows
        """
        # Join lines back into a single string
        csv_text = '\n'.join(lines)

        # Use the correct delimiter for parsing
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter

        try:
            # Try parsing with csv module
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = list(reader)

            # Post-process to clean up quotes and whitespace
            clean_rows = []
            for row in rows:
                clean_row = []
                for field in row:
                    field = field.strip()
                    # Remove matching outer quotes if present
                    if (field.startswith('"') and field.endswith('"')) or \
                       (field.startswith("'") and field.endswith("'")):
                        field = field[1:-1].strip()
                    clean_row.append(field)
                clean_rows.append(clean_row)

            return clean_rows
        except Exception:
            # Fallback: manual parsing
            rows = []
            for line in lines:
                fields = []
                current_field = ""
                in_quotes = False
                quote_char = None

                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current_field += char
                    elif char == actual_delimiter and not in_quotes:
                        fields.append(current_field.strip())
                        current_field = ""
                    else:
                        current_field += char

                fields.append(current_field.strip())
                rows.append(fields)

            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize column name to lowercase with underscores.

        Args:
            column: The column name to normalize

        Returns:
            Normalized column name
        """
        # Remove outer quotes if present
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or \
           (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()

        # Replace non-alphanumeric with underscores
        normalized = re.sub(r'[^\w\s]', '_', column)
        # Replace whitespace with underscores and convert to lowercase
        normalized = re.sub(r'\s+', '_', normalized).lower()
        # Remove consecutive underscores
        normalized = re.sub(r'_+', '_', normalized)
        # Remove leading/trailing underscores
        normalized = normalized.strip('_')

        return normalized or "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length.

        Args:
            row: The row to adjust
            expected_length: The expected number of fields

        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row

        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))

        # If row is too long, try to combine fields that might have been incorrectly split
        new_row = []
        i = 0
        while i < len(row):
            field = row[i]

            # If we've already reached expected length, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(' '.join(row[i:]))
                break

            # Check if this field starts with a quote but doesn't end with one
            if (field.startswith('"') and not field.endswith('"')) or \
               (field.startswith("'") and not field.endswith("'")):
                # Find the matching end quote
                combined = field
                j = i + 1
                found_end = False
                quote_char = field[0]

                while j < len(row):
                    combined += "," + row[j]  # Add back the comma that was removed during parsing
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1

                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1

        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]

        return new_row

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if a field looks like a date."""
        # Common date patterns
        date_patterns = [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY, etc.
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',    # YYYY-MM-DD, YYYY/MM/DD
            r'\w+\s+\d{1,2},?\s+\d{4}',        # Month DD, YYYY
            r'\d{1,2}\s+\w+\s+\d{4}'           # DD Month YYYY
        ]
        
        return any(re.match(pattern, field) for pattern in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if a column name indicates it contains date values."""
        date_indicators = ['date', 'time', 'day', 'month', 'year', 'born', 'created', 'modified', 'updated']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str):
        """Try to parse a date string into a date object."""
        try:
            return parser.parse(date_str, fuzzy=True)
        except:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if a field looks like a numeric value."""
        # Remove thousand separators and other formatting
        cleaned = re.sub(r'[,$%]', '', field.strip())
        # Match common numeric patterns
        return re.match(r'^-?\d+(\.\d+)?$', cleaned) is not None

    @staticmethod
    def format_number(field: str) -> str:
        """Format a numeric field consistently."""
        # Remove any non-numeric characters except decimal points and negative signs
        cleaned = re.sub(r'[^-0-9.]', '', field.strip())
        # Handle integer vs. float
        if '.' in cleaned:
            try:
                return str(float(cleaned))
            except ValueError:
                return cleaned
        else:
            try:
                return str(int(cleaned))
            except ValueError:
                return cleaned

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize field value based on content and column name.

        Args:
            field: The field value to clean
            column_name: The name of the column this field belongs to

        Returns:
            Cleaned field value
        """
        # Trim whitespace and remove outer quotes
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
           (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()

        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""

        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"

        # Try to detect and normalize date fields
        if mistral_largeRound2Solution.looks_like_date(field) or mistral_largeRound2Solution.is_date_column(column_name):
            try:
                date_obj = mistral_largeRound2Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except Exception:
                pass  # If date parsing fails, continue with other cleaning

        # Try to detect and normalize numeric fields
        if mistral_largeRound2Solution.looks_like_numeric(field):
            try:
                return mistral_largeRound2Solution.format_number(field)
            except Exception:
                pass  # If number parsing fails, return cleaned string

        return field
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from dateutil import parser as date_parser
from typing import List, Optional

class o3_miniRound3Solution:
    @staticmethod
    def solve(input_text):
        return o3_miniRound3Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Issues addressed:
          - Encoding problems (including BOM)
          - Inconsistent line endings
          - Automatic detection of delimiter among comma, semicolon, tab, pipe
          - Mixed quoting styles: converts to double quotes when needed
          - Normalizes header names (to lowercase with underscores)
          - Adjusts row-lengths when rows have too few/many fields
          - Trims whitespace and strips extraneous quotes from fields
          - Normalizes dates (to ISO format YYYY-MM-DD)
          - Normalizes numbers (removing currency symbols and thousand separators)
          - Standardizes booleans and null-like values
          
        Args:
            csv_data: Input CSV data (string or bytes)
            
        Returns:
            Clean, normalized CSV data as a string
        """
        # 1. Fix encoding issues (if bytes) and remove BOM; normalize line endings.
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # 2. Split into nonempty lines.
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # 3. Detect the most likely delimiter.
        delimiter = o3_miniRound3Solution.detect_delimiter(lines)
        
        # 4. Convert all lines to a standardized format:
        #    • Replace any common delimiter by comma.
        #    • Normalize quotes to double quotes.
        standardized_lines = o3_miniRound3Solution.standardize_delimiters(lines, delimiter)
        
        # 5. Parse CSV rows using the csv module.
        rows = o3_miniRound3Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # 6. Normalize header names.
        header = [o3_miniRound3Solution.normalize_column_name(col) for col in rows[0]]
        
        # 7. Process and clean each subsequent row.
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip rows that are entirely empty.
            if not row or all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header columns.
            row = o3_miniRound3Solution.adjust_row_length(row, len(header))
            
            # Clean each field according to its content as well as column name hints.
            cleaned_row = []
            for i, field in enumerate(row):
                col_name = header[i] if i < len(header) else f"column_{i}"
                cleaned_field = o3_miniRound3Solution.clean_field(field, col_name)
                cleaned_row.append(cleaned_field)
            normalized_rows.append(cleaned_row)
        
        # 8. Write normalized rows into CSV using the csv writer.
        output = io.StringIO()
        writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter based on the first few lines.
        Counts occurrences of candidate delimiters outside quotes.
        Candidates: comma, semicolon, tab, pipe.
        
        Returns:
            The candidate delimiter with the best (most consistent) score.
        """
        candidates = [',', ';', '\t', '|']
        delim_counts: Dict[str, List[int]] = {d: [] for d in candidates}
        
        for line in lines[:min(10, len(lines))]:
            for d in candidates:
                count = o3_miniRound3Solution.count_delimiters_outside_quotes(line, d)
                if count > 0:
                    delim_counts[d].append(count)
        
        best_delim = ','
        best_score = 0
        for d, counts in delim_counts.items():
            if not counts:
                continue
            counter = Counter(counts)
            common_count, frequency = counter.most_common(1)[0]
            score = frequency * common_count  # weight consistency and frequency
            if score > best_score:
                best_score = score
                best_delim = d

        # Special-case: if no delimiter got found use multiple spaces.
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        return best_delim

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """
        Count delimiter characters that occur outside quoted parts.
        Supports both single and double quotes.
        """
        count = 0
        in_quotes = False
        quote_char = None
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Convert all common delimiters in the lines to a standard comma.
        Also convert any mixed quotes to double quotes.
        
        Args:
            lines: List of CSV lines.
            primary_delimiter: Detected delimiter.
            
        Returns:
            A list of modified lines.
        """
        standardized = []
        
        for line in lines:
            # special handling for space-delimited if we detected multiple spaces
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                # Split by at least 2 spaces.
                fields = re.split(r'\s{2,}', line.strip())
                standardized.append(",".join(fields))
                continue
                
            new_line = ""
            in_quotes = False
            quote_char = None
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # open with double quote
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # close with double quote
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','
                else:
                    new_line += char
            standardized.append(new_line)
        
        return standardized

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV rows from standardized lines.
        Uses csv.reader to properly handle quotes.
        
        Args:
            lines: List of standardized CSV lines.
            detected_delimiter: Original detected delimiter (used if needed).
            
        Returns:
            List of rows with each row as a list of strings.
        """
        # If using space delimiter (r'\s+'), we already split fields.
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter
        csv_text = "\n".join(lines)
        try:
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = [ [cell.strip() for cell in row] for row in reader ]
            return rows
        except Exception:
            # Fallback to manual splitting:
            rows = []
            for line in lines:
                row = line.split(actual_delimiter)
                rows.append([cell.strip() for cell in row])
            return rows

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize header column name to lowercase with underscores.
        Also strips any surrounding quotes and extra whitespace.
        """
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        # Replace any non-alphanumeric character with underscore and collapse multiple underscores.
        name = re.sub(r'[^\w\s]', '_', name)
        name = re.sub(r'\s+', '_', name)
        name = re.sub(r'_+', '_', name)
        return name.strip('_').lower() or "column"

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        """
        Adjust the row so that it has exactly expected_length fields.
          - If too few, pad with empty strings.
          - If too many, try to merge extra fields into the last cell.
        """
        if len(row) == expected_length:
            return row
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If too many fields, attempt to combine extras in the last column.
        # This simple approach concatenates all extra fields.
        new_row = row[:expected_length-1]
        combined = " ".join(row[expected_length-1:])
        new_row.append(combined)
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize a single field.
          • Remove extraneous whitespace and matching outer quotes.
          • Replace null-like values with empty string.
          • Convert boolean-like values to "true"/"false".
          • If the value looks like a date (or column name suggests it), convert to ISO (YYYY-MM-DD).
          • If numeric, normalize number format.
        
        Args:
            field: Field content.
            column_name: Name of the column (to help infer type).
            
        Returns:
            Cleaned field.
        """
        field = field.strip()
        if len(field) >= 2 and field[0] == field[-1] and field[0] in ['"', "'"]:
            field = field[1:-1].strip()
        
        # Handle missing/null values.
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize booleans.
        low = field.lower()
        if low in ['true', 'yes', 'y', '1']:
            return "true"
        if low in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try converting dates: if the field looks like a date
        if o3_miniRound3Solution.looks_like_date(field) or o3_miniRound3Solution.is_date_column(column_name):
            date_obj = o3_miniRound3Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        # Try converting numbers if field contains digits.
        if o3_miniRound3Solution.looks_like_numeric(field):
            num = o3_miniRound3Solution.format_number(field)
            if num is not None:
                return num
        
        return field

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """
        Heuristic to decide if a field value resembles a date.
        Checks for common date patterns.
        """
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',     # e.g., 04/25/1991 or 25-12-2023
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',         # e.g., 1991-04-25
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # e.g., May 3rd, 1992
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'     # e.g., 3rd May 1992
        ]
        return any(re.search(p, field) for p in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """
        Check if the column name suggests it contains dates.
        """
        indicators = ['date', 'day', 'month', 'year', 'time', 'birth', 'updated', 'created']
        return any(ind in column_name.lower() for ind in indicators)

    @staticmethod
    def parse_date(date_str: str) -> Optional[date_parser]:
        """
        Attempt to parse a date from a string, removing ordinal suffixes if present.
        
        Returns:
            A datetime object if successful, otherwise None.
        """
        try:
            # Remove ordinal suffixes like "3rd" -> "3"
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            dt = date_parser.parse(cleaned, fuzzy=True)
            return dt
        except Exception:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """
        Check if the field appears to represent a number.
        Looks for digits and characters like comma or period.
        """
        # Remove any currency symbols for the check
        cleaned = re.sub(r'[$€£]', '', field)
        return any(ch.isdigit() for ch in cleaned)

    @staticmethod
    def format_number(number_str: str) -> Optional[str]:
        """
        Normalize a number into a standard format.
        Removes currency symbols, thousands separators and converts decimals.
        
        Handles US style (e.g. 1,234.56) and European style (e.g. 1.234,56).
        
        Returns:
            A standardized number string (integer if no fractional part) or None on failure.
        """
        # Remove spaces and currency symbols.
        temp = re.sub(r'[\s$€£]', '', number_str)
        
        # If both comma and dot exist, decide which is the decimal separator.
        if ',' in temp and '.' in temp:
            if temp.rfind(',') > temp.rfind('.'):
                # Likely European style: thousands separator is dot; replace dot by empty and comma by dot.
                temp = temp.replace('.', '')
                temp = temp.replace(',', '.')
            else:
                # Likely US style: remove commas.
                temp = temp.replace(',', '')
        elif ',' in temp and '.' not in temp:
            # Ambiguous: if comma appears only once and last part is two digits, treat as decimal.
            parts = temp.split(',')
            if len(parts) == 2 and len(parts[1]) in {1,2}:
                temp = temp.replace(',', '.')
            else:
                temp = temp.replace(',', '')
        
        try:
            num = float(temp)
            # Return integer string if no fraction, else a normalized float (dropping unneeded trailing zeros)
            if num.is_integer():
                return str(int(num))
            else:
                s = f"{num:.6f}".rstrip('0').rstrip('.')
                return s
        except ValueError:
            return None
from dateutil import parser as date_parser
from typing import List, Optional

class gpt4oRound3Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return gpt4oRound3Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect and standardize delimiters
        delimiter = gpt4oRound3Solution.detect_delimiter(lines)
        standardized_lines = gpt4oRound3Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV with detected delimiter
        rows = gpt4oRound3Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""
        
        # Normalize the header
        header = [gpt4oRound3Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            if not row or all(not cell.strip() for cell in row):
                continue
            adjusted_row = gpt4oRound3Solution.adjust_row_length(row, len(header))
            normalized_row = [gpt4oRound3Solution.clean_field(cell, header[i] if i < len(header) else f"column_{i}") for i, cell in enumerate(adjusted_row)]
            normalized_rows.append(normalized_row)
        
        # Write to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: [] for d in common_delimiters}
        
        for line in lines[:min(10, len(lines))]:
            for delim in common_delimiters:
                count = gpt4oRound3Solution.count_delimiters_outside_quotes(line, delim)
                delimiter_counts[delim].append(count)
        
        best_delimiter = max(delimiter_counts, key=lambda d: Counter(delimiter_counts[d]).most_common(1)[0][1])
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        standardized_lines = []
        
        for line in lines:
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','
                else:
                    new_line += char
                    
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        csv_text = '\n'.join(lines)
        try:
            reader = csv.reader(io.StringIO(csv_text), delimiter=',')
            return [row for row in reader]
        except Exception:
            return []

    @staticmethod
    def normalize_column_name(column: str) -> str:
        column = column.strip().lower()
        column = re.sub(r'[^\w\s]', '_', column)
        column = re.sub(r'\s+', '_', column)
        return column.strip('_')

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        elif len(row) > expected_length:
            return row[:expected_length]
        return row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        field = field.strip()
        if len(field) >= 2 and ((field.startswith('"') and field.endswith('"')) or (field.startswith("'") and field.endswith("'"))):
            field = field[1:-1].strip()
        
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        if gpt4oRound3Solution.looks_like_date(field) or gpt4oRound3Solution.is_date_column(column_name):
            date_obj = gpt4oRound3Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        if gpt4oRound3Solution.looks_like_numeric(field):
            return gpt4oRound3Solution.format_number(field)
        
        return field

    @staticmethod
    def looks_like_date(field: str) -> bool:
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            r'[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{2,4}',
            r'\d{1,2}\s+[A-Za-z]{3,9},?\s+\d{2,4}'
        ]
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str) -> Optional[date_parser.parser]:
        try:
            return date_parser.parse(date_str, fuzzy=True)
        except (date_parser.ParserError, ValueError):
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        cleaned = re.sub(r'[^\d.,-]', '', field)
        return bool(re.match(r'^[+-]?\d+([.,]\d+)*$', cleaned))

    @staticmethod
    def format_number(number_str: str) -> str:
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            number_str = number_str.replace(',', '.')
        try:
            num = float(number_str)
            return f"{num:.2f}".rstrip('0').rstrip('.') if '.' in str(num) else str(int(num))
        except ValueError:
            return number_str
from dateutil import parser as date_parser
from typing import List, Optional, Any

class claude37Round3Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return claude37Round3Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # --- 1. Handle encoding issues ---
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        
        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into non-empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # --- 2. Detect delimiter ---
        delimiter = claude37Round3Solution.detect_delimiter(lines)
        
        # --- 3. Parse CSV data ---
        rows = claude37Round3Solution.parse_csv_with_delimiter(lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # --- 4. Normalize header ---
        header = [claude37Round3Solution.normalize_column_name(col) for col in rows[0]]
        
        # --- 5. Process data rows ---
        cleaned_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header length
            row = claude37Round3Solution.adjust_row_length(row, len(header))
            
            # Clean each field based on content and column name
            cleaned_row = []
            for i, field in enumerate(row):
                column_name = header[i] if i < len(header) else f"column_{i}"
                cleaned_row.append(claude37Round3Solution.clean_field(field, column_name))
            
            cleaned_rows.append(cleaned_row)
        
        # --- 6. Write normalized data back to CSV ---
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(cleaned_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: [] for d in common_delimiters}
        
        # Analyze first few lines
        for line in lines[:min(10, len(lines))]:
            for delimiter in common_delimiters:
                # Count delimiters outside quotes
                count = claude37Round3Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Score based on consistency and frequency
            counter = Counter(counts)
            most_common, frequency = counter.most_common(1)[0]
            score = frequency * most_common
            
            if score > best_score:
                best_score = score
                best_delimiter = delimiter
        
        # Special case: check for multiple spaces as delimiter
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that appear outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def parse_csv_with_delimiter(lines: List[str], delimiter: str) -> List[List[str]]:
        """
        Parse CSV data using the detected delimiter.
        
        Args:
            lines: List of CSV data lines
            delimiter: The detected delimiter
            
        Returns:
            List of parsed rows
        """
        # Handle space delimiter specially
        if delimiter == r'\s+':
            rows = []
            for line in lines:
                # Split by multiple spaces while preserving quoted content
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        current += char
                    elif char.isspace() and not in_quotes and current:
                        if re.match(r'\s{2,}', char):
                            fields.append(current)
                            current = ""
                        else:
                            current += char
                    else:
                        current += char
                
                if current:
                    fields.append(current)
                rows.append(fields)
            return rows
        
        # For standard delimiters, use csv module
        try:
            # First standardize delimiters and quotes
            standardized_lines = []
            for line in lines:
                new_line = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                            new_line += '"'  # Standardize to double quotes
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                            new_line += '"'  # Standardize to double quotes
                        else:
                            new_line += char
                    elif char in [',', ';', '\t', '|'] and not in_quotes:
                        new_line += ','  # Standardize to comma
                    else:
                        new_line += char
                
                standardized_lines.append(new_line)
            
            csv_text = '\n'.join(standardized_lines)
            reader = csv.reader(io.StringIO(csv_text))
            return list(reader)
        except Exception:
            # Fallback to manual parsing if csv module fails
            rows = []
            for line in lines:
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current += char
                    elif char == delimiter and not in_quotes:
                        fields.append(current)
                        current = ""
                    else:
                        current += char
                
                fields.append(current)
                rows.append(fields)
            
            return rows

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            name: Column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove quotes and extra whitespace
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        
        # Convert to lowercase and replace spaces with underscores
        name = re.sub(r'\s+', '_', name.lower())
        
        # Replace non-alphanumeric characters with underscores
        name = re.sub(r'[^\w]', '_', name)
        
        # Remove consecutive underscores
        name = re.sub(r'_+', '_', name)
        
        # Remove leading/trailing underscores
        name = name.strip('_')
        
        return name if name else "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        """
        Adjust row to match expected length.
        
        Args:
            row: Row to adjust
            expected_length: Expected number of fields
            
        Returns:
            Adjusted row
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, try to intelligently combine fields
        new_row = []
        i = 0
        
        while i < len(row):
            # If we've reached our target length minus 1, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(" ".join(row[i:]))
                break
            
            field = row[i]
            
            # Check if this field starts with a quote but doesn't end with one
            if ((field.startswith('"') and not field.endswith('"')) or 
                (field.startswith("'") and not field.endswith("'"))):
                # Find matching end quote
                combined = field
                j = i + 1
                quote_char = field[0]
                found_end = False
                
                while j < len(row):
                    combined += "," + row[j]  # Add back the delimiter
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1
                
                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1
        
        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]
        
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str = "") -> str:
        """
        Clean and normalize a field value.
        
        Args:
            field: Field value to clean
            column_name: Name of the column (for type inference)
            
        Returns:
            Cleaned field value
        """
        # Remove outer quotes and trim whitespace
        field = field.strip()
        if len(field) >= 2:
            if (field[0] == field[-1]) and field[0] in ['"', "'"]:
                field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round3Solution.looks_like_date_column(column_name) or claude37Round3Solution.looks_like_date(field):
            try:
                date_obj = claude37Round3Solution.normalize_date(field)
                if date_obj:
                    return date_obj
            except Exception:
                pass  # Fall through to other normalizations if date parsing fails
        
        # Try to detect and normalize numeric fields
        if claude37Round3Solution.looks_like_numeric(field):
            try:
                return claude37Round3Solution.normalize_number(field)
            except Exception:
                pass  # Return original if number parsing fails
        
        return field

    @staticmethod
    def looks_like_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        # Common date patterns
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """
        Normalize date to ISO format (YYYY-MM-DD).
        
        Args:
            date_str: Date string to normalize
            
        Returns:
            Normalized date string
        """
        # Remove currency symbols and other non-date characters
        date_str = re.sub(r'[$€£]', '', date_str)
        
        # Handle ordinal suffixes (1st, 2nd, 3rd, 4th)
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        
        try:
            date_obj = date_parser.parse(date_str, fuzzy=True)
            return date_obj.strftime('%Y-%m-%d')
        except Exception:
            return date_str  # Return original if parsing fails

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if field looks like it contains a number."""
        # Strip currency symbols, spaces, and common separators
        cleaned = re.sub(r'[$€£\s]', '', field)
        return bool(re.match(r'^[+-]?[\d,.]+$', cleaned))

    @staticmethod
    def normalize_number(number_str: str) -> str:
        """
        Normalize numeric format.
        
        Args:
            number_str: Number string to normalize
            
        Returns:
            Normalized number string
        """
        # Remove currency symbols and whitespace
        number_str = re.sub(r'[$€£\s]', '', number_str)
        
        # Handle different number formats
        if ',' in number_str and '.' in number_str:
            # Determine which is the decimal separator
            if number_str.rindex(',') > number_str.rindex('.'):
                # European format: 1.234,56
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                # US format: 1,234.56
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            # Could be either 1,234 (US) or 1,23 (European)
            if len(number_str.split(',')[-1]) <= 2:
                # Likely European decimal comma
                number_str = number_str.replace(',', '.')
            else:
                # Likely US thousands separator
                number_str = number_str.replace(',', '')
        
        try:
            # Check if it's a valid number
            float(number_str)
            # Return it without trailing zeros after decimal
            if '.' in number_str:
                return str(float(number_str))
            return number_str
        except ValueError:
            # If conversion fails, return original
            return number_str
from typing import List, Optional, Any

class mistral_largeRound3Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return mistral_largeRound3Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.

        Args:
            csv_data: String containing messy CSV data

        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')

        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

        # Split into lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""

        # Detect and standardize delimiters
        delimiter = mistral_largeRound3Solution.detect_delimiter(lines)
        standardized_lines = mistral_largeRound3Solution.standardize_delimiters(lines, delimiter)

        # Parse CSV with detected delimiter
        rows = mistral_largeRound3Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""

        # Normalize header row
        header = [mistral_largeRound3Solution.normalize_column_name(col) for col in rows[0]]

        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue

            # Adjust row length to match header
            adjusted_row = mistral_largeRound3Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue

            # Clean each field
            normalized_row = [
                mistral_largeRound3Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]

            normalized_rows.append(normalized_row)

        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.

        Args:
            lines: List of CSV data lines

        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}

        for line in lines[:min(10, len(lines))]:  # Check first 10 lines or fewer
            for delimiter in common_delimiters:
                count = mistral_largeRound3Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)

        # Find most consistent non-zero delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0

        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue

            # Calculate consistency (how many lines have the same count)
            count_frequencies = Counter(counts)
            most_common_count, frequency = count_frequencies.most_common(1)[0]
            consistency_score = frequency * most_common_count  # Weight by both frequency and count

            if consistency_score > best_score:
                best_delimiter = delimiter
                best_score = consistency_score

        # Special case: if no clear delimiter is found, check for multiple spaces
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'

        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False

        for i, char in enumerate(line):
            # Handle escape sequences
            if escaped:
                escaped = False
                continue

            if char == '\\':
                escaped = True
                continue

            # Toggle quote state
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None

            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1

        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use the primary delimiter.

        Args:
            lines: List of CSV data lines
            primary_delimiter: The delimiter to standardize to

        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []

        for line in lines:
            # Handle space-delimited lines
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line)
                standardized_lines.append(','.join(fields))
                continue

            # For other delimiters, process quotes properly
            new_line = ""
            in_quotes = False
            quote_char = None

            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char

            standardized_lines.append(new_line)

        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling mixed quote styles and other issues.

        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The primary delimiter used in the data

        Returns:
            List of parsed CSV rows
        """
        # Join lines back into a single string
        csv_text = '\n'.join(lines)

        # Use the correct delimiter for parsing
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter

        try:
            # Try parsing with csv module
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = list(reader)

            # Post-process to clean up quotes and whitespace
            clean_rows = []
            for row in rows:
                clean_row = []
                for field in row:
                    field = field.strip()
                    # Remove matching outer quotes if present
                    if (field.startswith('"') and field.endswith('"')) or \
                       (field.startswith("'") and field.endswith("'")):
                        field = field[1:-1].strip()
                    clean_row.append(field)
                clean_rows.append(clean_row)

            return clean_rows
        except Exception:
            # Fallback: manual parsing
            rows = []
            for line in lines:
                fields = []
                current_field = ""
                in_quotes = False
                quote_char = None

                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current_field += char
                    elif char == actual_delimiter and not in_quotes:
                        fields.append(current_field.strip())
                        current_field = ""
                    else:
                        current_field += char

                fields.append(current_field.strip())
                rows.append(fields)

            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize column name to lowercase with underscores.

        Args:
            column: The column name to normalize

        Returns:
            Normalized column name
        """
        # Remove outer quotes if present
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or \
           (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()

        # Replace non-alphanumeric with underscores
        normalized = re.sub(r'[^\w\s]', '_', column)
        # Replace whitespace with underscores and convert to lowercase
        normalized = re.sub(r'\s+', '_', normalized).lower()
        # Remove consecutive underscores
        normalized = re.sub(r'_+', '_', normalized)
        # Remove leading/trailing underscores
        normalized = normalized.strip('_')

        return normalized or "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length.

        Args:
            row: The row to adjust
            expected_length: The expected number of fields

        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row

        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))

        # If row is too long, try to combine fields that might have been incorrectly split
        new_row = []
        i = 0
        while i < len(row):
            field = row[i]

            # If we've already reached expected length, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(' '.join(row[i:]))
                break

            # Check if this field starts with a quote but doesn't end with one
            if (field.startswith('"') and not field.endswith('"')) or \
               (field.startswith("'") and not field.endswith("'")):
                # Find the matching end quote
                combined = field
                j = i + 1
                found_end = False
                quote_char = field[0]

                while j < len(row):
                    combined += "," + row[j]  # Add back the comma that was removed during parsing
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1

                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1

        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]

        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize field value based on content and column name.

        Args:
            field: The field value to clean
            column_name: The name of the column this field belongs to

        Returns:
            Cleaned field value
        """
        # Trim whitespace and remove outer quotes
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
           (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()

        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""

        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"

        # Try to detect and normalize date fields
        if mistral_largeRound3Solution.looks_like_date(field) or mistral_largeRound3Solution.is_date_column(column_name):
            try:
                date_obj = mistral_largeRound3Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except Exception:
                pass  # If date parsing fails, continue with other cleaning

        # Try to detect and normalize numeric fields
        if mistral_largeRound3Solution.looks_like_numeric(field):
            try:
                return mistral_largeRound3Solution.format_number(field)
            except Exception:
                pass  # If number parsing fails, return cleaned string

        # Return the cleaned string
        return field
    
    @staticmethod
    def looks_like_date(field: str) -> bool:
        """
        Check if field looks like a date.
        """
        # Check for common date patterns
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY or DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',     # YYYY/MM/DD
            r'\d{1,2}\s+[A-Za-z]{3,}\s+\d{2,4}',  # DD Month YYYY
        ]
        for pattern in date_patterns:
            if re.match(pattern, field):
                return True
        return False

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """
        Check if column name suggests it contains date values.
        """
        date_keywords = ['date', 'day', 'month', 'year', 'time', 'born', 'created', 'modified', 'updated']
        return any(keyword in column_name.lower() for keyword in date_keywords)

    @staticmethod
    def parse_date(field: str) -> Optional[datetime]:
        """
        Parse a date string into a datetime object.
        """
        try:
            return parser.parse(field)
        except:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """
        Check if field looks like a numeric value.
        """
        # Remove currency symbols and thousands separators
        cleaned = re.sub(r'[$€£¥]', '', field)
        cleaned = re.sub(r'[,\s]', '', cleaned)
        # Check if it's a valid number
        return bool(re.match(r'^-?\d+(\.\d+)?$', cleaned))

    @staticmethod
    def format_number(field: str) -> str:
        """
        Format numeric values consistently.
        """
        # Remove currency symbols and thousands separators
        cleaned = re.sub(r'[$€£¥]', '', field)
        cleaned = re.sub(r'[,\s]', '', cleaned)
        
        # Handle decimal numbers
        if '.' in cleaned:
            try:
                value = float(cleaned)
                # If it's a whole number, convert to int
                if value.is_integer():
                    return str(int(value))
                # Otherwise, return with consistent decimal format
                return str(value)
            except ValueError:
                return field
        
        # Handle integers
        try:
            return str(int(cleaned))
        except ValueError:
            return field
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from dateutil import parser as date_parser
from typing import List, Optional

class o3_miniRound4Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return normalize_csv(input_text)

def normalize_csv(csv_data: str) -> str:
    """
    Clean and normalize messy CSV data.
    
    Issues addressed:
      • Ensures correct character encoding (using chardet if needed) and BOM removal
      • Normalizes line endings and splits only non-empty lines
      • Automatically detects delimiters (commas, semicolons, tabs, pipes or multi‐spaces)
      • Standardizes quote style to double quotes and converts all delimiters to commas
      • Uses csv.reader for robust parsing, then normalizes header names (to lowercase with underscores)
      • Adjusts row length when needed and cleans each field—trimming whitespace/quotes,
        converting null-like strings, booleans, dates (to YYYY-MM-DD) and numbers (US/European style)
      • Robust error handling for edge cases.
      
    Args:
        csv_data: String containing messy CSV data (or bytes)
        
    Returns:
        Clean, normalized CSV data as a string.
    """
    
    # 1. Handle encoding (if bytes) and remove BOM; normalize line endings.
    if isinstance(csv_data, bytes):
        detected = chardet.detect(csv_data)
        encoding = detected.get('encoding') or 'utf-8'
        csv_data = csv_data.decode(encoding, errors='replace')
    csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
    
    # 2. Split into nonempty lines.
    lines = [line for line in csv_data.split('\n') if line.strip()]
    if not lines:
        return ""
    
    # 3. Detect the likely delimiter.
    delimiter = detect_delimiter(lines)
    
    # 4. Standardize all lines:
    #    • Convert any common delimiter into a comma.
    #    • Standardize quotes to double quotes.
    standardized_lines = standardize_delimiters(lines, delimiter)
    
    # 5. Parse CSV rows using csv.reader for robust quote handling.
    rows = parse_csv_rows(standardized_lines, delimiter)
    if not rows or len(rows) < 1:
        return ""
    
    # 6. Normalize header names (lowercase, underscores, no extra quotes).
    header = [normalize_column_name(col) for col in rows[0]]
    
    # 7. Process and clean data rows.
    normalized_rows = [header]
    for row in rows[1:]:
        # Skip rows that are entirely empty.
        if not row or all(not cell.strip() for cell in row):
            continue
        
        # Adjust row length: pad if too short, or merge extra fields.
        row = adjust_row_length(row, len(header))
        if not row:
            continue
        
        # Clean each field based on its content and column hints.
        cleaned_row = []
        for i, field in enumerate(row):
            col_name = header[i] if i < len(header) else f"column_{i}"
            cleaned_row.append(clean_field(field, col_name))
        normalized_rows.append(cleaned_row)
    
    # 8. Write normalized rows into output CSV string.
    output = io.StringIO()
    writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
    writer.writerows(normalized_rows)
    return output.getvalue()


def detect_delimiter(lines: List[str]) -> str:
    """
    Detect the most likely delimiter from candidate symbols (comma, semicolon, tab, pipe).
    We count delimiter characters that occur outside quoted sections in the first few lines.
    
    Returns:
        The delimiter character that scores best (or multi-space regex if none found).
    """
    candidates = [',', ';', '\t', '|']
    delim_counts: Dict[str, List[int]] = {d: [] for d in candidates}
    
    for line in lines[:min(10, len(lines))]:
        for delim in candidates:
            count = count_delimiters_outside_quotes(line, delim)
            if count > 0:
                delim_counts[delim].append(count)
    
    best_delim = ','
    best_score = 0
    for delim, counts in delim_counts.items():
        if not counts:
            continue
        ctr = Counter(counts)
        common_count, frequency = ctr.most_common(1)[0]
        score = frequency * common_count  # combination of consistency and frequency
        if score > best_score:
            best_score = score
            best_delim = delim

    # Special-case: if no clear delimiter found, try multiple spaces as delimiter.
    if best_score == 0:
        for line in lines[:min(5, len(lines))]:
            if re.search(r'\s{2,}', line):
                return r'\s+'
    
    return best_delim

def count_delimiters_outside_quotes(line: str, delim: str) -> int:
    """
    Returns the count of the given delimiter that occurs outside of quoted text.
    Supports both single and double quotes.
    """
    count = 0
    in_quotes = False
    quote_char = None
    for char in line:
        if char in ['"', "'"]:
            if not in_quotes:
                in_quotes = True
                quote_char = char
            elif char == quote_char:
                in_quotes = False
                quote_char = None
        elif char == delim and not in_quotes:
            count += 1
    return count

def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
    """
    Convert all common delimiters in each line into the primary delimiter (a comma),
    and standardize quote characters to double quotes.
    For a multi-space delimiter (r'\s+'), split on two or more spaces.
    """
    standardized = []
    
    for line in lines:
        # Special handling if we use a multi-space delimiter.
        if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
            fields = re.split(r'\s{2,}', line.strip())
            standardized.append(",".join(fields))
            continue
            
        new_line = ""
        in_quotes = False
        quote_char = None
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                    new_line += '"'  # open with double quote
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
                    new_line += '"'  # close with double quote
                else:
                    new_line += char
            elif char in [',', ';', '\t', '|'] and not in_quotes:
                new_line += ','  # use comma as standard delimiter
            else:
                new_line += char
        standardized.append(new_line)
    
    return standardized

def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
    """
    Parse the standardized CSV lines into rows using csv.reader.
    If we are using a space-delimiter fallback, the delimiter is already applied.
    """
    actual_delim = ',' if detected_delimiter == r'\s+' else detected_delimiter
    csv_text = "\n".join(lines)
    try:
        reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delim)
        # Strip whitespace from each field.
        return [[cell.strip() for cell in row] for row in reader]
    except Exception:
        # Fallback: simple splitting if csv.reader fails.
        rows = []
        for line in lines:
            row = line.split(actual_delim)
            rows.append([cell.strip() for cell in row])
        return rows

def normalize_column_name(colname: str) -> str:
    """
    Normalize header column name: trim quotes and whitespace,
    convert to lowercase and replace non-alphanumerics with underscores.
    """
    colname = colname.strip()
    if (colname.startswith('"') and colname.endswith('"')) or (colname.startswith("'") and colname.endswith("'")):
        colname = colname[1:-1].strip()
    
    # Replace non-alphanumeric (except underscore) with underscore.
    colname = re.sub(r'[^\w\s]', '_', colname)
    colname = re.sub(r'\s+', '_', colname)
    colname = re.sub(r'_+', '_', colname)
    return colname.strip('_').lower() or "column"

def adjust_row_length(row: List[str], expected: int) -> List[str]:
    """
    Adjust the row so that it has exactly expected number of fields.
      • If there are too few, pad with empty strings.
      • If there are too many, merge extra fields into the last column.
    """
    if len(row) == expected:
        return row
    if len(row) < expected:
        return row + [""] * (expected - len(row))
    
    # If too many fields, join extra fields into the final column.
    new_row = row[:expected - 1]
    combined = " ".join(row[expected - 1:])
    new_row.append(combined)
    return new_row

def clean_field(field: str, column_name: str) -> str:
    """
    Clean and normalize a single CSV field.
      • Trim whitespace and remove extraneous outer quotes.
      • Treat null-like values as empty.
      • Normalize boolean values.
      • If the field looks like a date (or the column name suggests it), convert it to ISO format.
      • If the field appears numeric, normalize number format.
    """
    field = field.strip()
    # Remove matching outer quotes
    if len(field) >= 2 and field[0] == field[-1] and field[0] in ['"', "'"]:
        field = field[1:-1].strip()
    
    # Handle missing or null-like
    if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
        return ""
    
    # Normalize boolean values
    low = field.lower()
    if low in ['true', 'yes', 'y', '1']:
        return "true"
    if low in ['false', 'no', 'n', '0']:
        return "false"
    
    # If the text looks like a date or the header indicates a date, convert to ISO.
    if looks_like_date(field) or is_date_column(column_name):
        dt = parse_date(field)
        if dt:
            return dt.strftime('%Y-%m-%d')
    
    # If the field looks numeric, try to format it.
    if looks_like_numeric(field):
        num = format_number(field)
        if num is not None:
            return num
    
    return field

def looks_like_date(field: str) -> bool:
    """
    Heuristic to determine if a field's content resembles a date.
    Checks several regex-based date patterns.
    """
    date_patterns = [
        r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',           # 04/25/1991, 25-12-2023
        r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',               # 1991-04-25
        r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # May 3rd, 1992, March 12 1990
        r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'     # 3rd May 1992
    ]
    return any(re.search(p, field) for p in date_patterns)

def is_date_column(colname: str) -> bool:
    """
    Determine if the column name suggests it contains dates.
    """
    indicators = ['date', 'day', 'month', 'year', 'time', 'birth', 'updated', 'created']
    return any(ind in colname.lower() for ind in indicators)

def parse_date(date_str: str) -> Optional[date_parser]:
    """
    Attempt to parse a date string (fuzzy parsing after removing ordinal suffixes).
    Returns a datetime object if successful, else None.
    """
    # Remove ordinal suffixes (e.g. 3rd -> 3)
    cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
    try:
        return date_parser.parse(cleaned, fuzzy=True)
    except Exception:
        return None

def looks_like_numeric(field: str) -> bool:
    """
    Check if field appears to be numeric.
    Removes common currency symbols before checking.
    """
    cleaned = re.sub(r'[$€£\s]', '', field)
    return bool(re.search(r'\d', cleaned))

def format_number(num_str: str) -> Optional[str]:
    """
    Normalize a number string to standard format.
    Handles both US (1,234.56) and European (1.234,56) formats.
    Removes currency symbols and whitespace.
    """
    # Remove currency symbols and spaces.
    temp = re.sub(r'[$€£\s]', '', num_str)
    
    # If both comma and period present, decide which is the decimal marker.
    if ',' in temp and '.' in temp:
        if temp.rfind(',') > temp.rfind('.'):
            # Likely European: remove dots (thousands) and replace comma with period.
            temp = temp.replace('.', '')
            temp = temp.replace(',', '.')
        else:
            # Likely US: remove commas.
            temp = temp.replace(',', '')
    elif ',' in temp and '.' not in temp:
        # Ambiguous: if one comma and two digits after, treat as decimal.
        parts = temp.split(',')
        if len(parts) == 2 and len(parts[1]) in {1, 2}:
            temp = temp.replace(',', '.')
        else:
            temp = temp.replace(',', '')
    
    try:
        number = float(temp)
        # If number has no fractional part, return an integer.
        if number.is_integer():
            return str(int(number))
        else:
            s = f"{number:.6f}".rstrip('0').rstrip('.')
            return s
    except ValueError:
        return None
from dateutil import parser as date_parser
from typing import List, Optional

class gpt4oRound4Solution:
    @staticmethod
    def solve(input_text: str) -> str:
        return gpt4oRound4Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect and standardize delimiters
        delimiter = gpt4oRound4Solution.detect_delimiter(lines)
        standardized_lines = gpt4oRound4Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV with detected delimiter
        rows = gpt4oRound4Solution.parse_csv_rows(standardized_lines)
        if not rows:
            return ""
        
        # Normalize the header
        header = [gpt4oRound4Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            if not row or all(not cell.strip() for cell in row):
                continue
            adjusted_row = gpt4oRound4Solution.adjust_row_length(row, len(header))
            normalized_row = [gpt4oRound4Solution.clean_field(cell, header[i] if i < len(header) else f"column_{i}") for i, cell in enumerate(adjusted_row)]
            normalized_rows.append(normalized_row)
        
        # Write to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts = {d: [] for d in common_delimiters}
        
        for line in lines[:min(10, len(lines))]:
            for delim in common_delimiters:
                count = gpt4oRound4Solution.count_delimiters_outside_quotes(line, delim)
                delimiter_counts[delim].append(count)
        
        best_delimiter = max(delimiter_counts, key=lambda d: Counter(delimiter_counts[d]).most_common(1)[0][1])
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        standardized_lines = []
        
        for line in lines:
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','
                else:
                    new_line += char
                    
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str]) -> List[List[str]]:
        csv_text = '\n'.join(lines)
        reader = csv.reader(io.StringIO(csv_text), delimiter=',')
        return [row for row in reader]

    @staticmethod
    def normalize_column_name(column: str) -> str:
        column = column.strip().lower()
        column = re.sub(r'[^\w\s]', '_', column)
        column = re.sub(r'\s+', '_', column)
        return column.strip('_')

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        elif len(row) > expected_length:
            return row[:expected_length]
        return row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        field = field.strip()
        if len(field) >= 2 and ((field.startswith('"') and field.endswith('"')) or (field.startswith("'") and field.endswith("'"))):
            field = field[1:-1].strip()
        
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        if gpt4oRound4Solution.looks_like_date(field) or gpt4oRound4Solution.is_date_column(column_name):
            date_obj = gpt4oRound4Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        if gpt4oRound4Solution.looks_like_numeric(field):
            return gpt4oRound4Solution.format_number(field)
        
        return field

    @staticmethod
    def looks_like_date(field: str) -> bool:
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            r'[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{2,4}',
            r'\d{1,2}\s+[A-Za-z]{3,9},?\s+\d{2,4}'
        ]
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str) -> Optional[date_parser.parser]:
        try:
            return date_parser.parse(date_str, fuzzy=True)
        except (date_parser.ParserError, ValueError):
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        cleaned = re.sub(r'[^\d.,-]', '', field)
        return bool(re.match(r'^[+-]?\d+([.,]\d+)*$', cleaned))

    @staticmethod
    def format_number(number_str: str) -> str:
        number_str = re.sub(r'[^\d.,-]', '', number_str)
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            number_str = number_str.replace(',', '.')
        try:
            num = float(number_str)
            return f"{num:.2f}".rstrip('0').rstrip('.') if '.' in str(num) else str(int(num))
        except ValueError:
            return number_str
from dateutil import parser as date_parser
from typing import List, Optional, Any, Union

class claude37Round4Solution:
    @staticmethod
    def solve(input_text):
        return claude37Round4Solution.normalize_csv(input_text)

    @staticmethod
    def normalize_csv(csv_data: Union[str, bytes]) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String or bytes containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into non-empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect delimiter and standardize lines
        delimiter = claude37Round4Solution.detect_delimiter(lines)
        standardized_lines = claude37Round4Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV rows
        rows = claude37Round4Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # Normalize header row
        header = [claude37Round4Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header
            adjusted_row = claude37Round4Solution.adjust_row_length(row, len(header))
            
            # Clean each field based on content and column name
            cleaned_row = []
            for i, field in enumerate(adjusted_row):
                column_name = header[i] if i < len(header) else f"column_{i}"
                cleaned_row.append(claude37Round4Solution.clean_field(field, column_name))
            
            normalized_rows.append(cleaned_row)
        
        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}
        
        # Analyze first few lines
        for line in lines[:min(10, len(lines))]:
            for delimiter in common_delimiters:
                count = claude37Round4Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Score based on consistency and frequency
            counter = Counter(counts)
            most_common, frequency = counter.most_common(1)[0]
            score = frequency * most_common
            
            if score > best_score:
                best_score = score
                best_delimiter = delimiter
        
        # Special case: check for multiple spaces as delimiter
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that appear outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False
        
        for char in line:
            if escaped:
                escaped = False
                continue
                
            if char == '\\':
                escaped = True
                continue
                
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use comma as delimiter and double quotes for quoting.
        
        Args:
            lines: List of CSV data lines
            primary_delimiter: The detected delimiter
            
        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []
        
        for line in lines:
            # Special handling for space-delimited data
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line.strip())
                standardized_lines.append(','.join(f'"{f}"' if ',' in f else f for f in fields))
                continue
                
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char
            
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling quotes properly.
        
        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The detected delimiter
            
        Returns:
            List of parsed CSV rows
        """
        csv_text = '\n'.join(lines)
        
        try:
            reader = csv.reader(io.StringIO(csv_text))
            return list(reader)
        except Exception:
            # Fallback to manual parsing if csv module fails
            rows = []
            for line in lines:
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                
                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current += char
                    elif char == ',' and not in_quotes:
                        fields.append(current)
                        current = ""
                    else:
                        current += char
                
                if current or len(fields) > 0:
                    fields.append(current)
                    rows.append(fields)
            
            return rows

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            name: Column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove quotes and extra whitespace
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        
        # Convert to lowercase and replace non-alphanumeric with underscores
        name = re.sub(r'[^\w\s]', '_', name.lower())
        name = re.sub(r'\s+', '_', name)
        
        # Remove consecutive and trailing underscores
        name = re.sub(r'_+', '_', name)
        name = name.strip('_')
        
        return name if name else "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        """
        Adjust row to match expected length by padding or combining fields.
        
        Args:
            row: Row to adjust
            expected_length: Expected number of fields
            
        Returns:
            Adjusted row
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, try to intelligently combine fields
        new_row = []
        i = 0
        
        while i < len(row):
            # If we've reached our target length minus 1, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(" ".join(row[i:]))
                break
            
            field = row[i]
            
            # Check if this field starts with a quote but doesn't end with one
            if ((field.startswith('"') and not field.endswith('"')) or 
                (field.startswith("'") and not field.endswith("'"))):
                # Find matching end quote
                combined = field
                j = i + 1
                quote_char = field[0]
                found_end = False
                
                while j < len(row):
                    combined += "," + row[j]  # Add back the delimiter
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1
                
                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1
        
        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]
        
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize a field value based on content and column name.
        
        Args:
            field: Field value to clean
            column_name: Name of the column (for type inference)
            
        Returns:
            Cleaned field value
        """
        # Remove outer quotes and trim whitespace
        field = field.strip()
        if len(field) >= 2:
            if (field[0] == field[-1]) and field[0] in ['"', "'"]:
                field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round4Solution.is_date_column(column_name) or claude37Round4Solution.looks_like_date(field):
            date_obj = claude37Round4Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        # Try to detect and normalize numeric fields
        if claude37Round4Solution.looks_like_numeric(field):
            normalized_number = claude37Round4Solution.format_number(field)
            if normalized_number is not None:
                return normalized_number
        
        return field

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def parse_date(date_str: str) -> Optional[Any]:
        """
        Parse date string to datetime object.
        
        Args:
            date_str: Date string to parse
            
        Returns:
            Datetime object or None if parsing fails
        """
        try:
            # Remove ordinal suffixes (1st, 2nd, 3rd, etc.)
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            return date_parser.parse(cleaned, fuzzy=True)
        except Exception:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if field looks like it contains a number."""
        # Remove currency symbols and spaces
        cleaned = re.sub(r'[$€£\s]', '', field)
        # Check if it has digits and optionally decimal/thousand separators
        return bool(re.match(r'^[+-]?[\d,.]+$', cleaned))

    @staticmethod
    def format_number(number_str: str) -> Optional[str]:
        """
        Normalize number format by handling different separators.
        
        Args:
            number_str: Number string to normalize
            
        Returns:
            Normalized number string or None if parsing fails
        """
        # Remove currency symbols and spaces
        cleaned = re.sub(r'[$€£\s]', '', number_str)
        
        # Handle different number formats
        if ',' in cleaned and '.' in cleaned:
            # Determine which is the decimal separator
            if cleaned.rindex(',') > cleaned.rindex('.'):
                # European format (1.234,56)
                cleaned = cleaned.replace('.', '').replace(',', '.')
            else:
                # US format (1,234.56)
                cleaned = cleaned.replace(',', '')
        elif ',' in cleaned:
            # Could be either 1,234 (US) or 1,23 (European)
            # If comma is followed by exactly 2 digits, likely European decimal
            if re.search(r',\d{2}$', cleaned):
                cleaned = cleaned.replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
        
        try:
            num = float(cleaned)
            # Format integers without decimal point, floats with up to 6 digits
            if num.is_integer():
                return str(int(num))
            else:
                # Remove trailing zeros
                return str(num).rstrip('0').rstrip('.') if '.' in str(num) else str(num)
        except ValueError:
            return None
from dateutil import parser as date_parser
from typing import List, Optional

class mistral_largeRound4Solution:
    @staticmethod
    def solve(input_text):
        return mistral_largeRound4Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: str) -> str:
        """
        Clean and normalize messy CSV data.

        Args:
            csv_data: String containing messy CSV data

        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')

        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

        # Split into lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""

        # Detect and standardize delimiters
        delimiter = mistral_largeRound4Solution.detect_delimiter(lines)
        standardized_lines = mistral_largeRound4Solution.standardize_delimiters(lines, delimiter)

        # Parse CSV with detected delimiter
        rows = mistral_largeRound4Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""

        # Normalize header row
        header = [mistral_largeRound4Solution.normalize_column_name(col) for col in rows[0]]

        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue

            # Adjust row length to match header
            adjusted_row = mistral_largeRound4Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue

            # Clean each field
            normalized_row = [
                mistral_largeRound4Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]

            normalized_rows.append(normalized_row)

        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.

        Args:
            lines: List of CSV data lines

        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}

        for line in lines[:min(10, len(lines))]:  # Check first 10 lines or fewer
            for delimiter in common_delimiters:
                count = mistral_largeRound4Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)

        # Find most consistent non-zero delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0

        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue

            # Calculate consistency (how many lines have the same count)
            count_frequencies = Counter(counts)
            most_common_count, frequency = count_frequencies.most_common(1)[0]
            consistency_score = frequency * most_common_count  # Weight by both frequency and count

            if consistency_score > best_score:
                best_delimiter = delimiter
                best_score = consistency_score

        # Special case: if no clear delimiter is found, check for multiple spaces
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'

        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False

        for i, char in enumerate(line):
            # Handle escape sequences
            if escaped:
                escaped = False
                continue

            if char == '\\':
                escaped = True
                continue

            # Toggle quote state
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None

            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1

        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use the primary delimiter.

        Args:
            lines: List of CSV data lines
            primary_delimiter: The delimiter to standardize to

        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []

        for line in lines:
            # Handle space-delimited lines
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line)
                standardized_lines.append(','.join(fields))
                continue

            # For other delimiters, process quotes properly
            new_line = ""
            in_quotes = False
            quote_char = None

            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char

            standardized_lines.append(new_line)

        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling mixed quote styles and other issues.

        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The primary delimiter used in the data

        Returns:
            List of parsed CSV rows
        """
        # Join lines back into a single string
        csv_text = '\n'.join(lines)

        # Use the correct delimiter for parsing
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter

        try:
            # Try parsing with csv module
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = list(reader)

            # Post-process to clean up quotes and whitespace
            clean_rows = []
            for row in rows:
                clean_row = []
                for field in row:
                    field = field.strip()
                    # Remove matching outer quotes if present
                    if (field.startswith('"') and field.endswith('"')) or \
                    (field.startswith("'") and field.endswith("'")):
                        field = field[1:-1].strip()
                    clean_row.append(field)
                clean_rows.append(clean_row)

            return clean_rows
        except Exception:
            # Fallback: manual parsing
            rows = []
            for line in lines:
                fields = []
                current_field = ""
                in_quotes = False
                quote_char = None

                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current_field += char
                    elif char == actual_delimiter and not in_quotes:
                        fields.append(current_field.strip())
                        current_field = ""
                    else:
                        current_field += char

                fields.append(current_field.strip())
                rows.append(fields)

            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize column name to lowercase with underscores.

        Args:
            column: The column name to normalize

        Returns:
            Normalized column name
        """
        # Remove outer quotes if present
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or \
        (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()

        # Replace non-alphanumeric with underscores
        normalized = re.sub(r'[^\w\s]', '_', column)
        # Replace whitespace with underscores and convert to lowercase
        normalized = re.sub(r'\s+', '_', normalized).lower()
        # Remove consecutive underscores
        normalized = re.sub(r'_+', '_', normalized)
        # Remove leading/trailing underscores
        normalized = normalized.strip('_')

        return normalized or "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length.

        Args:
            row: The row to adjust
            expected_length: The expected number of fields

        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row

        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))

        # If row is too long, try to combine fields that might have been incorrectly split
        new_row = []
        i = 0
        while i < len(row):
            field = row[i]

            # If we've already reached expected length, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(' '.join(row[i:]))
                break

            # Check if this field starts with a quote but doesn't end with one
            if (field.startswith('"') and not field.endswith('"')) or \
            (field.startswith("'") and not field.endswith("'")):
                # Find the matching end quote
                combined = field
                j = i + 1
                found_end = False
                quote_char = field[0]

                while j < len(row):
                    combined += "," + row[j]  # Add back the comma that was removed during parsing
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1

                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1

        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]

        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize field value based on content and column name.

        Args:
            field: The field value to clean
            column_name: The name of the column this field belongs to

        Returns:
            Cleaned field value
        """
        # Trim whitespace and remove outer quotes
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
        (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()

        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""

        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"

        # Try to detect and normalize date fields
        if mistral_largeRound4Solution.looks_like_date(field) or mistral_largeRound4Solution.is_date_column(column_name):
            try:
                date_obj = mistral_largeRound4Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except Exception:
                pass  # If date parsing fails, continue with other cleaning

        # Try to detect and normalize numeric fields
        if mistral_largeRound4Solution.looks_like_numeric(field):
            try:
                return mistral_largeRound4Solution.format_number(field)
            except Exception:
                pass  # If number parsing fails, return cleaned string

        return field
        
    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if a field looks like a date."""
        # Common date patterns
        date_patterns = [
            r'\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}',  # DD/MM/YYYY or MM/DD/YYYY
            r'\d{4}[/\-\.]\d{1,2}[/\-\.]\d{1,2}',    # YYYY/MM/DD
            r'\d{1,2}[ -][A-Za-z]{3,9}[ -]\d{2,4}',  # DD Month YYYY
            r'[A-Za-z]{3,9}[ -]\d{1,2}[ -,]\d{2,4}'  # Month DD, YYYY
        ]
        
        for pattern in date_patterns:
            if re.search(pattern, field):
                return True
        return False

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'birthday', 'dob', 'created', 'updated']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def parse_date(date_str: str):
        """Parse date string using dateutil parser."""
        try:
            return date_parser.parse(date_str, fuzzy=True)
        except:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if a field looks like a number."""
        # Strip currency symbols and other common number formatting
        cleaned = re.sub(r'[,$%€£¥\s]', '', field)
        # Check if it's a valid number representation
        try:
            float(cleaned)
            return True
        except ValueError:
            return False

    @staticmethod
    def format_number(field: str) -> str:
        """Format numeric field consistently."""
        # Strip currency symbols and other formatting
        cleaned = re.sub(r'[,$%€£¥\s]', '', field)
        
        # Convert to float and format
        value = float(cleaned)
        if value.is_integer():
            return str(int(value))
        else:
            # Use a reasonable precision
            return str(round(value, 6)).rstrip('0').rstrip('.') if '.' in str(round(value, 6)) else str(int(value))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from dateutil import parser as date_parser
from typing import List, Optional, Union, Any

class o3_miniRound5Solution:
    @staticmethod
    def solve(input_text):
        return o3_miniRound5Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: Union[str, bytes]) -> str:
        """
        Clean and normalize messy CSV data.

        The function performs the following:
          1. Handles character encoding (detect bytes and BOM, normalizes line-endings).
          2. Splits the input into non-empty lines.
          3. Detects the delimiter by counting occurrences outside quotes.
          4. Standardizes lines so that all delimiters become commas and quotes become double quotes.
          5. Parses the CSV data, normalizes header names (lowercased, underscores), and adjusts row lengths.
          6. Cleans each field by trimming extra whitespace/quotes, converting null/boolean values,
             normalizing dates to ISO format, and converting numeric formats.
          7. Reconstructs the data using csv.writer to guarantee proper escaping.

        Args:
            csv_data: CSV data as a string or bytes.

        Returns:
            Clean, normalized CSV data as a string.
        """
        # 1. Handle encoding (works even if csv_data is bytes) and remove BOM.
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # 2. Split into non-empty lines.
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # 3. Detect delimiter.
        delimiter = o3_miniRound5Solution.detect_delimiter(lines)
        
        # 4. Standardize the delimiters and quote characters.
        standardized_lines = o3_miniRound5Solution.standardize_delimiters(lines, delimiter)
        
        # 5. Parse CSV rows.
        rows = o3_miniRound5Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # 6. Normalize header row.
        header = [o3_miniRound5Solution.normalize_column_name(col) for col in rows[0]]
        
        # 7. Process and clean each row.
        normalized_rows = [header]
        for row in rows[1:]:
            if not row or all(not cell.strip() for cell in row):
                continue  # skip completely blank rows
            
            adjusted = o3_miniRound5Solution.adjust_row_length(row, len(header))
            if not adjusted:
                continue
            
            cleaned = [o3_miniRound5Solution.clean_field(adjusted[i], header[i] if i < len(header) else f"column_{i}") 
                          for i in range(len(adjusted))]
            normalized_rows.append(cleaned)
        
        # 8. Write back to CSV string.
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Analyze the first few lines and count candidate delimiters outside quotes.
        Returns the most consistent delimiter; if none is found, fallback to multiple spaces.

        Args:
            lines: List of input CSV lines.
            
        Returns:
            A delimiter character or pattern (r'\s+') if multiple spaces seem to be used.
        """
        common_delimiters = [',', ';', '\t', '|']
        delim_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}
        
        for line in lines[:min(10, len(lines))]:
            for delim in common_delimiters:
                cnt = o3_miniRound5Solution.count_delimiters_outside_quotes(line, delim)
                if cnt > 0:
                    delim_counts[delim].append(cnt)
        
        best_delimiter = ','
        best_score = 0
        for delim, counts in delim_counts.items():
            if not counts:
                continue
            freq = Counter(counts).most_common(1)[0]  # (most_common_count, frequency)
            count_val, frequency = freq
            score = count_val * frequency  # combination of count and consistency
            if score > best_score:
                best_score = score
                best_delimiter = delim

        # Fallback if no traditional delimiter is found: check for multiple spaces.
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """
        Count the occurrences of a delimiter outside quoted parts.
        
        Args:
            line: A single CSV line.
            delimiter: The delimiter candidate.
        
        Returns:
            The count of delimiter occurrences outside any quotes.
        """
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False
        
        for char in line:
            if escaped:
                escaped = False
                continue
            if char == '\\':
                escaped = True
                continue
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Convert all delimiters in every line to comma and turn any quotes into standard double quotes.
        
        If the detected delimiter is a multi-space pattern (r'\s+'),
        then split on two or more spaces.
        
        Args:
            lines: List of CSV line strings.
            primary_delimiter: The detected delimiter from detect_delimiter().
        
        Returns:
            List of standardized CSV line strings.
        """
        standardized = []
        for line in lines:
            # When using multi-space-delimiter:
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                # Split when two or more spaces occur.
                fields = re.split(r'\s{2,}', line.strip())
                standardized.append(",".join(fields))
                continue
            
            new_line = ""
            in_quotes = False
            quote_char = None
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # start double quote
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # close with double quote
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # force comma delimiter
                else:
                    new_line += char
            standardized.append(new_line)
        return standardized

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV rows using Python's csv.reader. If the delimiter is r'\s+', treat the data as already separated.
        
        Args:
            lines: List of standardized CSV lines.
            detected_delimiter: The delimiter detected.
        
        Returns:
            List of rows, where each row is a list of cell values.
        """
        csv_text = "\n".join(lines)
        actual_delim = ',' if detected_delimiter == r'\s+' else detected_delimiter
        try:
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delim)
            # Also strip extra whitespace from each cell.
            return [[cell.strip() for cell in row] for row in reader]
        except Exception:
            # Fallback: manually split each line.
            rows = []
            for line in lines:
                row = [cell.strip() for cell in line.split(actual_delim)]
                rows.append(row)
            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize header names: remove extra quotes/whitespace, convert to lowercase,
        and replace non-alphanumerics with underscores.
        
        Args:
            column: The original column name.
        
        Returns:
            Normalized column name.
        """
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()
        normalized = re.sub(r'[^\w\s]', '_', column)
        normalized = re.sub(r'\s+', '_', normalized).lower()
        normalized = re.sub(r'_+', '_', normalized).strip('_')
        return normalized or "column"

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Ensure the row has exactly expected_length fields:
          - If too few, pad with empty strings.
          - If too many, try to combine fields (e.g. if quoted text was split).
        
        Args:
            row: The parsed row.
            expected_length: The number of header columns.
        
        Returns:
            A row list with exactly expected_length elements.
        """
        if len(row) == expected_length:
            return row
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # For too many fields, try to combine extra fields into the last column.
        new_row = row[:expected_length-1]
        combined = " ".join(row[expected_length-1:])
        new_row.append(combined)
        if len(new_row) == expected_length:
            return new_row
        return new_row[:expected_length]

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean a field by:
          - Removing extra whitespace and outer quotes.
          - Converting common null (or missing) values to empty strings.
          - Normalizing booleans.
          - Converting dates (if the text looks like a date or the column indicates a date) to ISO format.
          - Converting numeric values (US or European style) to a standardized format.
        
        Args:
            field: The cell value to clean.
            column_name: The name of its column (for type hints).
        
        Returns:
            The cleaned field value.
        """
        field = field.strip()
        # If field is wrapped in quotes, remove them.
        if len(field) >= 2 and field[0] == field[-1] and field[0] in ['"', "'"]:
            field = field[1:-1].strip()
        
        # Empty and null-like values.
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize booleans.
        low_field = field.lower()
        if low_field in ['true', 'yes', 'y', '1']:
            return "true"
        if low_field in ['false', 'no', 'n', '0']:
            return "false"
        
        # If the field looks like a date or the header indicates a date, try to parse it.
        if o3_miniRound5Solution.is_date_column(column_name) or o3_miniRound5Solution.looks_like_date(field):
            dt = o3_miniRound5Solution.parse_date(field)
            if dt:
                return dt.strftime('%Y-%m-%d')
        
        # If the field looks numeric, try to format it.
        if o3_miniRound5Solution.looks_like_numeric(field):
            num = o3_miniRound5Solution.format_number(field)
            if num is not None:
                return num
        
        return field

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """
        Heuristic: if the column name contains words suggestive of dates, return True.
        
        Args:
            column_name: Name of the column.
            
        Returns:
            True if the column is likely a date.
        """
        keywords = ['date', 'day', 'month', 'year', 'time', 'birth', 'updated', 'created']
        return any(kw in column_name.lower() for kw in keywords)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """
        Use regular expressions to guess if a field appears to contain a date.
        
        Args:
            field: The cell content.
        
        Returns:
            True if it matches typical date patterns.
        """
        patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',       # e.g. 04/25/1991 or 25-12-2023
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',          # e.g. 1991-04-25
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # e.g. May 3rd, 1992
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'     # e.g. 3rd May 1992
        ]
        return any(re.search(p, field) for p in patterns)

    @staticmethod
    def parse_date(date_str: str) -> Optional[Any]:
        """
        Parse a date string using dateutil's parser (with fuzzy matching).
        Also removes ordinal suffixes (e.g., "3rd" becomes "3").
        
        Args:
            date_str: A date string.
        
        Returns:
            A datetime object if successful or None.
        """
        cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        try:
            return date_parser.parse(cleaned, fuzzy=True)
        except Exception:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """
        Determine if a field appears to have a numeric value.
        Currency symbols and spaces are removed first.
        
        Args:
            field: The cell content.
        
        Returns:
            True if the field contains digits and allowed numeric punctuation.
        """
        cleaned = re.sub(r'[$€£\s]', '', field)
        return bool(re.match(r'^[+-]?\d+([.,]\d+)*$', cleaned))

    @staticmethod
    def format_number(num_str: str) -> Optional[str]:
        """
        Normalize numbers with both US and European formats.
        
        Steps:
          - Remove currency symbols and spaces.
          - If both comma and period exist, decide which one is the decimal separator.
          - Convert the cleaned string to float and then back to a string (dropping unnecessary zeros).
        
        Args:
            num_str: The numeric string.
            
        Returns:
            The normalized number string or None on failure.
        """
        cleaned = re.sub(r'[$€£\s]', '', num_str)
        # If both delimiters exist, decide based on which comes last.
        if ',' in cleaned and '.' in cleaned:
            if cleaned.rfind(',') > cleaned.rfind('.'):
                cleaned = cleaned.replace('.', '').replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
        elif ',' in cleaned and '.' not in cleaned:
            # If one comma is followed by exactly 2 digits, assume decimal; otherwise remove comma.
            if re.search(r',\d{1,2}$', cleaned):
                cleaned = cleaned.replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
        try:
            number = float(cleaned)
            if number.is_integer():
                return str(int(number))
            else:
                # Format to remove trailing zeros while keeping up to 6 decimals.
                formatted = f"{number:.6f}".rstrip('0').rstrip('.')
                return formatted
        except ValueError:
            return None
from dateutil import parser as date_parser
from typing import List, Optional, Union

class gpt4oRound5Solution:
    @staticmethod
    def solve(input_text):
        return gpt4oRound5Solution.normalize_csv(input_text)
        
    @staticmethod
    def normalize_csv(csv_data: Union[str, bytes]) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String or bytes containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            csv_data = csv_data.decode(detected['encoding'] or 'utf-8', errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into non-empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""

        # Detect and standardize delimiters
        delimiter = gpt4oRound5Solution.detect_delimiter(lines)
        standardized_lines = gpt4oRound5Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV rows
        rows = gpt4oRound5Solution.parse_csv_rows(standardized_lines)
        if not rows or len(rows) < 1:
            return ""
        
        # Normalize header row
        header = [gpt4oRound5Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header
            adjusted_row = gpt4oRound5Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue
            
            # Clean each field based on content and column name
            cleaned_row = [
                gpt4oRound5Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]
            
            normalized_rows.append(cleaned_row)
        
        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}
        
        # Analyze first few lines
        for line in lines[:min(10, len(lines))]:
            for delimiter in common_delimiters:
                count = gpt4oRound5Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Score based on consistency and frequency
            counter = Counter(counts)
            most_common, frequency = counter.most_common(1)[0]
            score = frequency * most_common
            
            if score > best_score:
                best_score = score
                best_delimiter = delimiter
        
        # Special case: check for multiple spaces as delimiter
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that appear outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        
        for char in line:
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use comma as delimiter and double quotes for quoting.
        
        Args:
            lines: List of CSV data lines
            primary_delimiter: The detected delimiter
            
        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []
        
        for line in lines:
            # Special handling for space-delimited data
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line.strip())
                standardized_lines.append(','.join(fields))
                continue
                
            new_line = ""
            in_quotes = False
            quote_char = None
            
            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char
            
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str]) -> List[List[str]]:
        """
        Parse the standardized CSV lines into rows using csv.reader.
        
        Args:
            lines: List of standardized CSV data lines
            
        Returns:
            List of parsed CSV rows
        """
        csv_text = '\n'.join(lines)
        reader = csv.reader(io.StringIO(csv_text), delimiter=',')
        return [row for row in reader]

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            name: Column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove quotes and extra whitespace
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        
        # Convert to lowercase and replace non-alphanumeric with underscores
        name = re.sub(r'[^\w\s]', '_', name.lower())
        name = re.sub(r'\s+', '_', name)
        
        # Remove consecutive and trailing underscores
        name = re.sub(r'_+', '_', name)
        name = name.strip('_')
        
        return name if name else "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> List[str]:
        """
        Adjust row to match expected length by padding or combining fields.
        
        Args:
            row: Row to adjust
            expected_length: Expected number of fields
            
        Returns:
            Adjusted row
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, combine extra fields
        new_row = row[:expected_length - 1]
        combined = " ".join(row[expected_length - 1:])
        new_row.append(combined)
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize a single CSV field.
        
        Args:
            field: Field value to clean
            column_name: Name of the column (for type inference)
            
        Returns:
            Cleaned field value
        """
        # Remove outer quotes and trim whitespace
        field = field.strip()
        if len(field) >= 2 and ((field.startswith('"') and field.endswith('"')) or (field.startswith("'") and field.endswith("'"))):
            field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if gpt4oRound5Solution.is_date_column(column_name) or gpt4oRound5Solution.looks_like_date(field):
            date_obj = gpt4oRound5Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        # Try to detect and normalize numeric fields
        if gpt4oRound5Solution.looks_like_numeric(field):
            normalized_number = gpt4oRound5Solution.format_number(field)
            if normalized_number is not None:
                return normalized_number
        
        return field

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def parse_date(date_str: str) -> Optional[date_parser.parser]:
        """Attempt to parse a date string to a datetime object."""
        try:
            # Remove ordinal suffixes (e.g., 1st, 2nd, 3rd)
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            return date_parser.parse(cleaned, fuzzy=True)
        except (date_parser.ParserError, ValueError):
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if field appears to be numeric."""
        cleaned = re.sub(r'[$€£\s]', '', field)
        return bool(re.search(r'\d', cleaned))

    @staticmethod
    def format_number(number_str: str) -> Optional[str]:
        """
        Normalize a number string to standard format.
        
        Args:
            number_str: Number string to normalize
            
        Returns:
            Normalized number string or None if parsing fails
        """
        # Remove currency symbols and spaces
        number_str = re.sub(r'[$€£\s]', '', number_str)
        
        # Handle different number formats
        if ',' in number_str and '.' in number_str:
            if number_str.rindex(',') > number_str.rindex('.'):
                # European format: remove dots and replace last comma with dot
                number_str = number_str.replace('.', '').replace(',', '.')
            else:
                # US format: remove commas
                number_str = number_str.replace(',', '')
        elif ',' in number_str:
            # If one comma and two digits after, treat as decimal
            parts = number_str.split(',')
            if len(parts) == 2 and len(parts[1]) in {1, 2}:
                number_str = number_str.replace(',', '.')
            else:
                number_str = number_str.replace(',', '')
        
        try:
            num = float(number_str)
            return f"{num:.2f}".rstrip('0').rstrip('.') if '.' in str(num) else str(int(num))
        except ValueError:
            return number_str
from dateutil import parser as date_parser
from typing import List, Optional, Union, Any

class claude37Round5Solution:
    @staticmethod
    def solve(input_text: Union[str, bytes]) -> str:
        return claude37Round5Solution.normalize_csv(input_text)
    
    @staticmethod
    def normalize_csv(csv_data: Union[str, bytes]) -> str:
        """
        Clean and normalize messy CSV data.
        
        Args:
            csv_data: String or bytes containing messy CSV data
            
        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')
        
        # Remove BOM and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
        
        # Split into non-empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""
        
        # Detect delimiter and standardize lines
        delimiter = claude37Round5Solution.detect_delimiter(lines)
        standardized_lines = claude37Round5Solution.standardize_delimiters(lines, delimiter)
        
        # Parse CSV rows
        rows = claude37Round5Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows or len(rows) < 1:
            return ""
        
        # Normalize header row
        header = [claude37Round5Solution.normalize_column_name(col) for col in rows[0]]
        
        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue
            
            # Adjust row length to match header
            adjusted_row = claude37Round5Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue
            
            # Clean each field based on content and column name
            cleaned_row = []
            for i, field in enumerate(adjusted_row):
                column_name = header[i] if i < len(header) else f"column_{i}"
                cleaned_row.append(claude37Round5Solution.clean_field(field, column_name))
            
            normalized_rows.append(cleaned_row)
        
        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)
        
        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter by analyzing patterns outside quoted sections.
        
        Args:
            lines: List of CSV data lines
            
        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}
        
        # Analyze first few lines
        for line in lines[:min(10, len(lines))]:
            for delimiter in common_delimiters:
                count = claude37Round5Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)
        
        # Find most consistent delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0
        
        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue
            
            # Score based on consistency and frequency
            counter = Counter(counts)
            most_common, frequency = counter.most_common(1)[0]
            score = frequency * most_common
            
            if score > best_score:
                best_score = score
                best_delimiter = delimiter
        
        # Special case: check for multiple spaces as delimiter
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'
        
        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that appear outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False
        
        for char in line:
            if escaped:
                escaped = False
                continue
                
            if char == '\\':
                escaped = True
                continue
                
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None
            elif char == delimiter and not in_quotes:
                count += 1
        
        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use comma as delimiter and double quotes for quoting.
        
        Args:
            lines: List of CSV data lines
            primary_delimiter: The detected delimiter
            
        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []
        
        for line in lines:
            # Special handling for space-delimited data
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line.strip())
                standardized_lines.append(','.join(f'"{f}"' if ',' in f else f for f in fields))
                continue
                
            new_line = ""
            in_quotes = False
            quote_char = None
            escaped = False
            
            for char in line:
                if escaped:
                    escaped = False
                    new_line += char
                    continue
                    
                if char == '\\':
                    escaped = True
                    new_line += char
                    continue
                    
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char
            
            standardized_lines.append(new_line)
        
        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling quotes properly.
        
        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The detected delimiter
            
        Returns:
            List of parsed CSV rows
        """
        csv_text = '\n'.join(lines)
        
        try:
            reader = csv.reader(io.StringIO(csv_text))
            return list(reader)
        except Exception:
            # Fallback to manual parsing if csv module fails
            rows = []
            for line in lines:
                fields = []
                current = ""
                in_quotes = False
                quote_char = None
                escaped = False
                
                for char in line:
                    if escaped:
                        escaped = False
                        current += char
                        continue
                        
                    if char == '\\':
                        escaped = True
                        continue
                        
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current += char
                    elif char == ',' and not in_quotes:
                        fields.append(current)
                        current = ""
                    else:
                        current += char
                
                if current or len(fields) > 0:
                    fields.append(current)
                    rows.append(fields)
            
            return rows

    @staticmethod
    def normalize_column_name(name: str) -> str:
        """
        Normalize column name to lowercase with underscores.
        
        Args:
            name: Column name to normalize
            
        Returns:
            Normalized column name
        """
        # Remove quotes and extra whitespace
        name = name.strip()
        if (name.startswith('"') and name.endswith('"')) or (name.startswith("'") and name.endswith("'")):
            name = name[1:-1].strip()
        
        # Convert to lowercase and replace non-alphanumeric with underscores
        name = re.sub(r'[^\w\s]', '_', name.lower())
        name = re.sub(r'\s+', '_', name)
        
        # Remove consecutive and trailing underscores
        name = re.sub(r'_+', '_', name)
        name = name.strip('_')
        
        return name if name else "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length by padding or combining fields.
        
        Args:
            row: Row to adjust
            expected_length: Expected number of fields
            
        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row
        
        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))
        
        # If row is too long, try to intelligently combine fields
        new_row = []
        i = 0
        
        while i < len(row):
            # If we've reached our target length minus 1, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(" ".join(row[i:]))
                break
            
            field = row[i]
            
            # Check if this field starts with a quote but doesn't end with one
            if ((field.startswith('"') and not field.endswith('"')) or 
                (field.startswith("'") and not field.endswith("'"))):
                # Find matching end quote
                combined = field
                j = i + 1
                quote_char = field[0]
                found_end = False
                
                while j < len(row):
                    combined += "," + row[j]  # Add back the delimiter
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1
                
                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1
        
        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]
        
        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize a field value based on content and column name.
        
        Args:
            field: Field value to clean
            column_name: Name of the column (for type inference)
            
        Returns:
            Cleaned field value
        """
        # Remove outer quotes and trim whitespace
        field = field.strip()
        if len(field) >= 2:
            if (field[0] == field[-1]) and field[0] in ['"', "'"]:
                field = field[1:-1].strip()
        
        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""
        
        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"
        
        # Try to detect and normalize date fields
        if claude37Round5Solution.is_date_column(column_name) or claude37Round5Solution.looks_like_date(field):
            date_obj = claude37Round5Solution.parse_date(field)
            if date_obj:
                return date_obj.strftime('%Y-%m-%d')
        
        # Try to detect and normalize numeric fields
        if claude37Round5Solution.looks_like_numeric(field):
            normalized_number = claude37Round5Solution.format_number(field)
            if normalized_number is not None:
                return normalized_number
        
        return field

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if column name suggests it contains dates."""
        date_indicators = ['date', 'day', 'month', 'year', 'time', 'updated', 'created', 'birth']
        return any(indicator in column_name.lower() for indicator in date_indicators)

    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if field value looks like a date."""
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # MM/DD/YYYY, DD/MM/YYYY
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',    # YYYY-MM-DD
            r'[A-Za-z]{3,9}\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{2,4}',  # Month DD, YYYY
            r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]{3,9},?\s+\d{2,4}'   # DD Month YYYY
        ]
        
        return any(re.search(pattern, field) for pattern in date_patterns)

    @staticmethod
    def parse_date(date_str: str) -> Optional[Any]:
        """
        Parse date string to datetime object.
        
        Args:
            date_str: Date string to parse
            
        Returns:
            Datetime object or None if parsing fails
        """
        try:
            # Remove ordinal suffixes (1st, 2nd, 3rd, etc.)
            cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            return date_parser.parse(cleaned, fuzzy=True)
        except Exception:
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if field looks like it contains a number."""
        # Remove currency symbols and spaces
        cleaned = re.sub(r'[$€£\s]', '', field)
        # Check if it has digits and optionally decimal/thousand separators
        return bool(re.match(r'^[+-]?[\d,.]+$', cleaned))

    @staticmethod
    def format_number(number_str: str) -> Optional[str]:
        """
        Normalize number format by handling different separators.
        
        Args:
            number_str: Number string to normalize
            
        Returns:
            Normalized number string or None if parsing fails
        """
        # Remove currency symbols and spaces
        cleaned = re.sub(r'[$€£\s]', '', number_str)
        
        try:
            # Handle different number formats
            if ',' in cleaned and '.' in cleaned:
                # Determine which is the decimal separator
                if cleaned.rindex(',') > cleaned.rindex('.'):
                    # European format (1.234,56)
                    cleaned = cleaned.replace('.', '').replace(',', '.')
                else:
                    # US format (1,234.56)
                    cleaned = cleaned.replace(',', '')
            elif ',' in cleaned:
                # Could be either 1,234 (US) or 1,23 (European)
                # If comma is followed by exactly 2 digits at the end, likely European decimal
                if re.search(r',\d{2}$', cleaned):
                    cleaned = cleaned.replace(',', '.')
                else:
                    cleaned = cleaned.replace(',', '')
            
            # Try to convert to float to standardize format
            float_value = float(cleaned)
            
            # Format integers without decimals, otherwise use exactly 2 decimal places
            if float_value.is_integer():
                return str(int(float_value))
            else:
                return str(float_value)
        except Exception:
            return number_str
from dateutil import parser as date_parser
from typing import List, Optional, Union
from datetime import datetime

class mistral_largeRound5Solution:
    @staticmethod
    def solve(input_text):
        return mistral_largeRound5Solution.normalize_csv(input_text)
        
    @staticmethod
    def normalize_csv(csv_data: Union[str, bytes]) -> str:
        """
        Clean and normalize messy CSV data.

        Args:
            csv_data: String or bytes containing messy CSV data

        Returns:
            String containing cleaned, normalized CSV data
        """
        # Handle character encoding issues
        if isinstance(csv_data, bytes):
            detected = chardet.detect(csv_data)
            encoding = detected.get('encoding') or 'utf-8'
            csv_data = csv_data.decode(encoding, errors='replace')

        # Remove BOM if present and normalize line endings
        csv_data = csv_data.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

        # Split into lines and remove empty lines
        lines = [line for line in csv_data.split('\n') if line.strip()]
        if not lines:
            return ""

        # Detect and standardize delimiters
        delimiter = mistral_largeRound5Solution.detect_delimiter(lines)
        standardized_lines = mistral_largeRound5Solution.standardize_delimiters(lines, delimiter)

        # Parse CSV with detected delimiter
        rows = mistral_largeRound5Solution.parse_csv_rows(standardized_lines, delimiter)
        if not rows:
            return ""

        # Normalize header row
        header = [mistral_largeRound5Solution.normalize_column_name(col) for col in rows[0]]

        # Process data rows
        normalized_rows = [header]
        for row in rows[1:]:
            # Skip completely empty rows
            if not row or all(not cell.strip() for cell in row):
                continue

            # Adjust row length to match header
            adjusted_row = mistral_largeRound5Solution.adjust_row_length(row, len(header))
            if not adjusted_row:
                continue

            # Clean each field
            normalized_row = [
                mistral_largeRound5Solution.clean_field(field, header[i] if i < len(header) else f"column_{i}")
                for i, field in enumerate(adjusted_row)
            ]

            normalized_rows.append(normalized_row)

        # Write normalized data back to CSV
        output = io.StringIO()
        writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerows(normalized_rows)

        return output.getvalue()

    @staticmethod
    def detect_delimiter(lines: List[str]) -> str:
        """
        Detect the most likely delimiter in CSV data by analyzing patterns outside quoted sections.

        Args:
            lines: List of CSV data lines

        Returns:
            The most likely delimiter character
        """
        common_delimiters = [',', ';', '\t', '|']
        delimiter_counts: Dict[str, List[int]] = {d: [] for d in common_delimiters}

        for line in lines[:min(10, len(lines))]:  # Check first 10 lines or fewer
            for delimiter in common_delimiters:
                count = mistral_largeRound5Solution.count_delimiters_outside_quotes(line, delimiter)
                if count > 0:
                    delimiter_counts[delimiter].append(count)

        # Find most consistent non-zero delimiter
        best_delimiter = ','  # Default to comma
        best_score = 0

        for delimiter, counts in delimiter_counts.items():
            if not counts:
                continue

            # Calculate consistency (how many lines have the same count)
            count_frequencies = Counter(counts)
            most_common_count, frequency = count_frequencies.most_common(1)[0]
            consistency_score = frequency * most_common_count  # Weight by both frequency and count

            if consistency_score > best_score:
                best_delimiter = delimiter
                best_score = consistency_score

        # Special case: if no clear delimiter is found, check for multiple spaces
        if best_score == 0:
            for line in lines[:min(5, len(lines))]:
                if re.search(r'\s{2,}', line):
                    return r'\s+'

        return best_delimiter

    @staticmethod
    def count_delimiters_outside_quotes(line: str, delimiter: str) -> int:
        """Count delimiters that are outside of quoted sections."""
        count = 0
        in_quotes = False
        quote_char = None
        escaped = False

        for i, char in enumerate(line):
            # Handle escape sequences
            if escaped:
                escaped = False
                continue

            if char == '\\':
                escaped = True
                continue

            # Toggle quote state
            if char in ['"', "'"]:
                if not in_quotes:
                    in_quotes = True
                    quote_char = char
                elif char == quote_char:
                    in_quotes = False
                    quote_char = None

            # Count delimiter if outside quotes
            elif char == delimiter and not in_quotes:
                count += 1

        return count

    @staticmethod
    def standardize_delimiters(lines: List[str], primary_delimiter: str) -> List[str]:
        """
        Standardize all lines to use the primary delimiter.

        Args:
            lines: List of CSV data lines
            primary_delimiter: The delimiter to standardize to

        Returns:
            List of standardized CSV lines
        """
        standardized_lines = []

        for line in lines:
            # Handle space-delimited lines
            if primary_delimiter == r'\s+' and not any(d in line for d in [',', ';', '\t', '|']):
                fields = re.split(r'\s{2,}', line)
                standardized_lines.append(','.join(fields))
                continue

            # For other delimiters, process quotes properly
            new_line = ""
            in_quotes = False
            quote_char = None

            for char in line:
                if char in ['"', "'"]:
                    if not in_quotes:
                        in_quotes = True
                        quote_char = char
                        new_line += '"'  # Standardize to double quotes
                    elif char == quote_char:
                        in_quotes = False
                        quote_char = None
                        new_line += '"'  # Standardize to double quotes
                    else:
                        new_line += char  # Different quote inside quoted text
                elif char in [',', ';', '\t', '|'] and not in_quotes:
                    new_line += ','  # Standardize to comma
                else:
                    new_line += char

            standardized_lines.append(new_line)

        return standardized_lines

    @staticmethod
    def parse_csv_rows(lines: List[str], detected_delimiter: str) -> List[List[str]]:
        """
        Parse CSV lines into rows, handling mixed quote styles and other issues.

        Args:
            lines: List of standardized CSV lines
            detected_delimiter: The primary delimiter used in the data

        Returns:
            List of parsed CSV rows
        """
        # Join lines back into a single string
        csv_text = '\n'.join(lines)

        # Use the correct delimiter for parsing
        actual_delimiter = ',' if detected_delimiter == r'\s+' else detected_delimiter

        try:
            # Try parsing with csv module
            reader = csv.reader(io.StringIO(csv_text), delimiter=actual_delimiter)
            rows = list(reader)

            # Post-process to clean up quotes and whitespace
            clean_rows = []
            for row in rows:
                clean_row = []
                for field in row:
                    field = field.strip()
                    # Remove matching outer quotes if present
                    if (field.startswith('"') and field.endswith('"')) or \
                       (field.startswith("'") and field.endswith("'")):
                        field = field[1:-1].strip()
                    clean_row.append(field)
                clean_rows.append(clean_row)

            return clean_rows
        except Exception:
            # Fallback: manual parsing
            rows = []
            for line in lines:
                fields = []
                current_field = ""
                in_quotes = False
                quote_char = None

                for char in line:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                            quote_char = None
                        else:
                            current_field += char
                    elif char == actual_delimiter and not in_quotes:
                        fields.append(current_field.strip())
                        current_field = ""
                    else:
                        current_field += char

                fields.append(current_field.strip())
                rows.append(fields)

            return rows

    @staticmethod
    def normalize_column_name(column: str) -> str:
        """
        Normalize column name to lowercase with underscores.

        Args:
            column: The column name to normalize

        Returns:
            Normalized column name
        """
        # Remove outer quotes if present
        column = column.strip()
        if (column.startswith('"') and column.endswith('"')) or \
           (column.startswith("'") and column.endswith("'")):
            column = column[1:-1].strip()

        # Replace non-alphanumeric with underscores
        normalized = re.sub(r'[^\w\s]', '_', column)
        # Replace whitespace with underscores and convert to lowercase
        normalized = re.sub(r'\s+', '_', normalized).lower()
        # Remove consecutive underscores
        normalized = re.sub(r'_+', '_', normalized)
        # Remove leading/trailing underscores
        normalized = normalized.strip('_')

        return normalized or "column"  # Ensure we don't return empty string

    @staticmethod
    def adjust_row_length(row: List[str], expected_length: int) -> Optional[List[str]]:
        """
        Adjust row to match expected length.

        Args:
            row: The row to adjust
            expected_length: The expected number of fields

        Returns:
            Adjusted row or None if adjustment is not possible
        """
        if len(row) == expected_length:
            return row

        # If row is too short, pad with empty strings
        if len(row) < expected_length:
            return row + [""] * (expected_length - len(row))

        # If row is too long, try to combine fields that might have been incorrectly split
        new_row = []
        i = 0
        while i < len(row):
            field = row[i]

            # If we've already reached expected length, combine all remaining fields
            if len(new_row) == expected_length - 1:
                new_row.append(' '.join(row[i:]))
                break

            # Check if this field starts with a quote but doesn't end with one
            if (field.startswith('"') and not field.endswith('"')) or \
               (field.startswith("'") and not field.endswith("'")):
                # Find the matching end quote
                combined = field
                j = i + 1
                found_end = False
                quote_char = field[0]

                while j < len(row):
                    combined += "," + row[j]  # Add back the comma that was removed during parsing
                    if row[j].endswith(quote_char):
                        found_end = True
                        break
                    j += 1

                if found_end:
                    new_row.append(combined)
                    i = j + 1
                else:
                    new_row.append(field)
                    i += 1
            else:
                new_row.append(field)
                i += 1

        # If we still have too many fields, truncate
        if len(new_row) > expected_length:
            return new_row[:expected_length]

        return new_row

    @staticmethod
    def clean_field(field: str, column_name: str) -> str:
        """
        Clean and normalize field value based on content and column name.

        Args:
            field: The field value to clean
            column_name: The name of the column this field belongs to

        Returns:
            Cleaned field value
        """
        # Trim whitespace and remove outer quotes
        field = field.strip()
        if (field.startswith('"') and field.endswith('"')) or \
           (field.startswith("'") and field.endswith("'")):
            field = field[1:-1].strip()

        # Handle empty or null-like values
        if not field or field.lower() in ['null', 'none', 'na', 'n/a', '-']:
            return ""

        # Normalize boolean values
        if field.lower() in ['true', 'yes', 'y', '1']:
            return "true"
        if field.lower() in ['false', 'no', 'n', '0']:
            return "false"

        # Try to detect and normalize date fields
        if mistral_largeRound5Solution.looks_like_date(field) or mistral_largeRound5Solution.is_date_column(column_name):
            try:
                date_obj = mistral_largeRound5Solution.parse_date(field)
                if date_obj:
                    return date_obj.strftime('%Y-%m-%d')
            except Exception:
                pass  # If date parsing fails, continue with other cleaning

        # Try to detect and normalize numeric fields
        if mistral_largeRound5Solution.looks_like_numeric(field):
            try:
                return mistral_largeRound5Solution.format_number(field)
            except Exception:
                pass  # If number parsing fails, return original field

        return field
    
    @staticmethod
    def looks_like_date(field: str) -> bool:
        """Check if a field value looks like a date."""
        # Date patterns: YYYY-MM-DD, MM/DD/YYYY, DD-MM-YYYY, etc.
        date_patterns = [
            r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',  # YYYY-MM-DD
            r'\d{1,2}[-/]\d{1,2}[-/]\d{4}',  # MM-DD-YYYY or DD-MM-YYYY
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2}',  # MM-DD-YY or DD-MM-YY
            r'[A-Za-z]{3,} \d{1,2},? \d{4}',  # Month DD, YYYY
            r'\d{1,2} [A-Za-z]{3,},? \d{4}'   # DD Month YYYY
        ]
        
        for pattern in date_patterns:
            if re.search(pattern, field):
                return True
        return False

    @staticmethod
    def is_date_column(column_name: str) -> bool:
        """Check if a column name suggests it contains dates."""
        date_keywords = ['date', 'day', 'month', 'year', 'dob', 'created', 'updated', 
                         'timestamp', 'time', 'birthday', 'birth']
        return any(keyword in column_name.lower() for keyword in date_keywords)

    @staticmethod
    def parse_date(date_str: str) -> Optional[datetime]:
        """Parse a date string into a datetime object."""
        try:
            return date_parser.parse(date_str, fuzzy=True)
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def looks_like_numeric(field: str) -> bool:
        """Check if a field value looks like a number."""
        # Remove common number formatting characters
        cleaned = re.sub(r'[,$%()]', '', field).strip()
        
        # Check if it's a decimal number
        if re.fullmatch(r'-?\d+(\.\d+)?', cleaned):
            return True
            
        # Check for scientific notation
        if re.fullmatch(r'-?\d+(\.\d+)?[eE][+-]?\d+', cleaned):
            return True
            
        return False

    @staticmethod
    def format_number(field: str) -> str:
        """Format numeric values consistently."""
        # Remove currency symbols, commas, parentheses (negative numbers)
        cleaned = re.sub(r'[,$%]', '', field)
        cleaned = cleaned.replace('(', '-').replace(')', '')
        
        # Convert to float and back to string to normalize
        try:
            num = float(cleaned)
            # Format integers without decimal point
            if num.is_integer():
                return str(int(num))
            else:
                # Remove trailing zeros but keep significant decimals
                return str(num).rstrip('0').rstrip('.') if '.' in str(num) else str(num)
        except ValueError:
            return field

def main():
    """Test all solutions and collect metrics"""
    parser = argparse.ArgumentParser(description="Test LLM tournament solutions")
    parser.add_argument("--input", type=str, required=True, help="Input file to test on")
    parser.add_argument("--output-dir", type=str, default="output_results_for_each_round_and_model",
                    help="Directory for results")
    args = parser.parse_args()
    
    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Read the input file
    with open(args.input, "r", encoding="utf-8") as f:
        input_text = f.read()
    
    def count_lines(text: str) -> int:
        """Count the number of lines in a text"""
        return len(text.splitlines())

    def count_chars(text: str) -> int:
        """Count the number of characters in a text"""
        return len(text)        
    
    # Get input file metrics
    input_lines = count_lines(input_text)
    input_chars = count_chars(input_text)
    print(f"Input file: {args.input}")
    print(f"Input lines: {input_lines}")
    print(f"Input chars: {input_chars}")
    
    # List of all solution classes
    solution_classes = [
        o3_miniRound0Solution, gpt4oRound0Solution, claude37Round0Solution, mistral_largeRound0Solution, o3_miniRound1Solution, gpt4oRound1Solution, claude37Round1Solution, mistral_largeRound1Solution, o3_miniRound2Solution, gpt4oRound2Solution, claude37Round2Solution, mistral_largeRound2Solution, o3_miniRound3Solution, gpt4oRound3Solution, claude37Round3Solution, mistral_largeRound3Solution, o3_miniRound4Solution, gpt4oRound4Solution, claude37Round4Solution, mistral_largeRound4Solution, o3_miniRound5Solution, gpt4oRound5Solution, claude37Round5Solution, mistral_largeRound5Solution
    ]
    
    # Test each solution
    metrics = []
    
    for solution_class in solution_classes:
        class_name = solution_class.__name__
        print(f"\nTesting {class_name}...")

        # Extract model name and round number
        parts = class_name.split("Round")
        model_name = parts[0].replace("_", "-").lower()
        round_num = parts[1].split("Solution")[0]
        
        try:
            # Apply the solution
            start_time = time.time()
            result = solution_class.solve(input_text)
            execution_time = time.time() - start_time
            
            # Save the result
            output_filename = f"sample_file_output__{model_name}_round_{round_num}.md"
            output_path = output_dir / output_filename
            
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(result)
                
            # Calculate metrics
            output_lines = count_lines(result)
            output_chars = count_chars(result)
            output_size_kb = len(result) / 1024
            
            # Store metrics
            solution_metrics = {
                "model": model_name,
                "round": round_num,
                "execution_time": round(execution_time, 2),
                "output_lines": output_lines,
                "output_chars": output_chars,
                "output_size_kb": round(output_size_kb, 2),
                "lines_ratio": round(output_lines / input_lines, 2),
                "chars_ratio": round(output_chars / input_chars, 2)
            }
            
            metrics.append(solution_metrics)
            
            # Print metrics
            print(f"  Execution time: {solution_metrics['execution_time']}s")
            print(f"  Output lines: {output_lines}")
            print(f"  Output size: {solution_metrics['output_size_kb']} KB")
            print(f"  Output saved to: {output_path}")
            
        except Exception as e:
            print(f"  Error testing {class_name}: {str(e)}")

    # Save metrics
    metrics_path = output_dir.parent / "metrics" / "test_metrics.json"
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
        
    print(f"\nMetrics saved to: {metrics_path}")
    
if __name__ == "__main__":
    main()