{ "dataset": "campaign_booklet", "name": "South Korean Election Campaign Booklets", "description": "Original krpoltext campaign booklet corpus artifact covering 49,678 document rows from South Korean presidential, National Assembly, and local elections, 2000-2022.", "time_coverage": "2000-2022", "data_version": "v2022", "package_version": "0.2.0", "variant": "original", "default_variant": "original", "available_variants": ["original", "enriched"], "variant_description": "The original krpoltext campaign booklet corpus artifact.", "recommended_use": "General corpus analysis and backward-compatible workflows.", "identifier_columns": "code", "text_columns": ["text", "filtered"], "supported_formats": ["csv", "parquet"], "managed_formats": ["csv", "parquet"], "artifacts": { "csv": { "format": "csv", "file": "sk_election_campaign_booklet_v2022.csv", "download_url": "https://osf.io/download/6ybj8/", "sha256": "6ce6f40f5358829b167109d9ca9195e5089d2c6d05a61ad1c1925e424f55021d", "size_bytes": 756245336, "managed": true }, "parquet": { "format": "parquet", "file": "sk_election_campaign_booklet_v2022.parquet", "download_url": "https://osf.io/download/pxg2k/", "sha256": "a291a887d157963cffcffbe2c1ad60333222dd479bf4b01e90cec3a28d5c19a6", "size_bytes": 406524268, "managed": true } }, "columns": [ { "name": "date", "type": "character", "description": "Election date (YYYY-MM-DD)" }, { "name": "name", "type": "character", "description": "Candidate name (Korean)" }, { "name": "region", "type": "character", "description": "Metropolitan region (province or metropolitan city)" }, { "name": "district", "type": "character", "description": "Electoral district" }, { "name": "office_id", "type": "integer", "description": "Office type identifier (1=president, 2=national_assembly, 3=edu_superintendent, 4=metro_head, 5=metro_assembly, 6=basic_head, 7=basic_assembly)" }, { "name": "office", "type": "character", "description": "Office type label (president, national_assembly, edu_superintendent, 
metro_head, metro_assembly, basic_head, basic_assembly)" }, { "name": "giho", "type": "integer", "description": "Candidate ballot number" }, { "name": "party", "type": "character", "description": "Political party name (Korean)" }, { "name": "party_eng", "type": "character", "description": "Political party name (English); transliteration if no official English name" }, { "name": "result", "type": "character", "description": "Election result in Korean" }, { "name": "sex", "type": "character", "description": "Sex in Korean" }, { "name": "birthday", "type": "character", "description": "Date of birth (YYYY-MM-DD)" }, { "name": "age", "type": "integer", "description": "Age at the time of the election" }, { "name": "job_id", "type": "integer", "description": "Original NEC job category identifier (varies across years)" }, { "name": "job", "type": "character", "description": "Standardized job category (Korean)" }, { "name": "job_name", "type": "character", "description": "Job title (Korean)" }, { "name": "job_name_eng", "type": "character", "description": "Job title (English)" }, { "name": "job_code", "type": "integer", "description": "Standardized job code consistent across years" }, { "name": "edu_id", "type": "integer", "description": "Original NEC education level identifier (varies across years)" }, { "name": "edu", "type": "character", "description": "Education description (Korean, free-text from NEC)" }, { "name": "edu_name", "type": "character", "description": "Standardized education level label (Korean)" }, { "name": "edu_name_eng", "type": "character", "description": "Standardized education level label (English)" }, { "name": "edu_code", "type": "integer", "description": "Standardized education code consistent across years" }, { "name": "career1", "type": "character", "description": "Career description 1" }, { "name": "career2", "type": "character", "description": "Career description 2" }, { "name": "pages", "type": "integer", "description": "Number of pages in the 
booklet" }, { "name": "code", "type": "character", "description": "krpoltext document row identifier", "identifier": true }, { "name": "sex_code", "type": "integer", "description": "Sex code: 1 = male, 0 = female" }, { "name": "result_code", "type": "integer", "description": "Result code: 1 = elected, 0 = not elected" }, { "name": "text", "type": "character", "description": "Full OCR-extracted text of the campaign booklet" }, { "name": "filtered", "type": "character", "description": "Parsed text after morphological analysis; Korean characters only (numbers, foreign characters, and symbols removed)" } ], "notes": { "missing_values": "2,283 rows have no booklet code or text because a booklet was not available. 151 rows are missing biographical information. 23 booklets were unprocessable.", "text_processing": "All text is UTF-8 encoded Korean. 'text' contains the full original text; 'filtered' contains the morphologically parsed version.", "identifiers": "'code' is the krpoltext document row identifier, but some original rows have missing code values, so row identity should not be inferred from code alone. 'job_id' and 'edu_id' vary across election years; use 'job_code' and 'edu_code' for cross-year analysis.", "provenance": "The original variant is the source corpus artifact distributed without NEC linkage fields." 
}, "extras": { "office_mapping": [ { "office_id": 1, "office": "president", "description": "Presidential election" }, { "office_id": 2, "office": "national_assembly", "description": "National Assembly election" }, { "office_id": 3, "office": "edu_superintendent", "description": "Education superintendent" }, { "office_id": 4, "office": "metro_head", "description": "Metropolitan city mayor / provincial governor" }, { "office_id": 5, "office": "metro_assembly", "description": "Metropolitan assembly member" }, { "office_id": 6, "office": "basic_head", "description": "Basic local government head" }, { "office_id": 7, "office": "basic_assembly", "description": "Basic assembly member" } ], "row_universe": "Original campaign_booklet CSV source artifact." } }