<!DOCTYPE html><html lang="ja"><head><meta charset="utf-8">
<title>Character Model for the World Wide Web 1.0: Fundamentals （日本語訳）</title>

<link rel="stylesheet" href="common.css" type="text/css" />

<style>

/* Page-specific styles for this document, declared in addition to the linked common.css. */

/* indented example blocks */
div.example {
	margin-left: 2em;
	margin-top: 1em;
}


/* conformance requirements */
div.req {
	margin-top: 0.5em;
	background: #ffffcc;
}

	/* requirement heading */
h5 {
	margin: 0;
	font-size: 1em;
}

h5 > i {
	margin-left: 0.3em;
}
	/* requirement body */
div.req > p {
	margin-top: 0.5em;
	text-indent: 1em;
}


/* example and note paragraphs share the same left indent */
p.example,
p.note {
	margin-left: 2em;
}

	/* tables: horizontally centered, with 1px black borders on the table and its cells */
table {
	margin-left:auto;
	margin-right:auto;
	min-width:35em;
	border: solid black 1px;
}
td, th {
	text-align:center;
	border: solid black 1px;
}

/* captions are placed below the table */
caption {
	caption-side: bottom;
	text-align: left;
	color: #36B;
	font-style: italic;
}
/* from: table summary */
span.summary {
	font-size: small;
}

q {
font-style: italic;
}

/* [specification] [implementation] [content] labels */
i {
	color:#005A9C;
}


/* when printing, requirement boxes use a different background color */
@media print {
	.req { background: #ffcc99 }
}

samp,
span.qchar {
	color: #802;
}

samp {
	font-family: inherit; /* avoid the monospace font */
}

/* span.qchar content is shown between generated black double quotes */
span.qchar::before,
span.qchar::after {
	content: '"';
	color:black;
}

/* a kbd nested directly inside another kbd is shown between gray angle brackets */
kbd > kbd::before {
	content:'<';
	color: gray;
}
kbd > kbd::after {
	content:'>';
	color: gray;
}
</style>

<script src="common0a.js" ></script>
<script src="common1.js" async></script>


<script>

//del_j(); 検証済み 2005-02-15 spec


/*
 * Switches the wording level of the main content through Util.word_switcher
 * and then marks the matching 'words<N>' control as checked.
 *
 * level: value handed unchanged to switcher.switchWords().
 */
function switch_words(level){
	var ws = Util.word_switcher;

	// The switcher operates on the element with id 'MAIN'.
	// NOTE(review): the original marks this assignment as "once" — presumably
	// it only needs to be set the first time; confirm in Util.word_switcher.
	ws.main_id = 'MAIN';
	ws.switchWords(level);

	// Use the level reported by the switcher afterwards, not the argument.
	var control = E('words' + ws.level);
	control.checked = true;
}


/*
 * Appendix C: expands (or collapses) the list of conformance criteria.
 *
 * If the '_checklist_' container already has element children it is emptied.
 * Otherwise every 'div.req' is cloned into it, and the first node of each
 * clone's leading heading is wrapped in a link back to the original heading.
 *
 * NOTE(review): E, C and repeat are helpers defined outside this file —
 * presumably getElementById, createElement and a for-each over a selector.
 */
function expand_checklist(){
	var container = E('_checklist_');

	// Already expanded: collapse by emptying the container.
	if(container.firstElementChild){
		container.textContent = '';
		return;
	}

	repeat('div.req', function(req){
		var copy = req.cloneNode(true);
		var heading = copy.firstElementChild;

		// Link to the original heading, then drop the id from the clone so
		// the id is not duplicated in the document.
		var link = C('a');
		link.href = '#' + heading.id;
		heading.removeAttribute('id');

		// Move the heading's first node into the link and put the link in
		// its place at the front of the heading.
		link.appendChild(heading.firstChild);
		heading.insertBefore(link, heading.firstChild);

		container.appendChild(copy);
	})
}

</script>

<script>

/*
Dynamically attaches a link to the word the mouse pointer is over.
Requires support for:
	caretPositionFromPoint [CSSOM View]
	surroundContents [DOM Range]
	bind
*/

// Deferred start-up: check the 'words0' control, then initialise the linker.
Util.DEFERRED.push(function(){
	var words0 = E('words0');
	words0.checked = true;
	wordLinknizer.init();
});


/*
 * wordLinknizer: wraps the term under the mouse pointer in a link, pointing
 * either at an element in this page or at the Unicode glossary.
 *
 * Term lookups use own-property checks (see has()), so words that happen to
 * be names on Object.prototype ("constructor", "toString", ...) are never
 * mistaken for known terms.
 */
var wordLinknizer = {
	// The single <a> element that is moved around to wrap the current term.
	anchor: null,
	// term text -> id of a target in this page, or 1 for a term that is only
	// listed in unicode_map.
	dfn_map : {},
	// term text -> fragment identifier in the Unicode glossary.
	unicode_map : {},
	// Text whose parent element has one of these tag names is never linked.
	tags_excluded :{B:1, DFN:1, A:1, H1:1, H2:1, H3:1, H4:1},
	// Client coordinates of the most recent mousemove event.
	x: 0,
	y: 0,
	// Id of the most recently scheduled debounce timeout (0 before the first).
	timer: 0,
	// Reusable Range delimiting the term to wrap.
	range: null,
	// Prefix of every Unicode glossary link.
	glossary_url: 'https://www.unicode.org/glossary/#',

	/*
	 * Own-property test. Unlike `key in map`, this ignores properties
	 * inherited from Object.prototype.
	 */
	has: function(map, key){
		return Object.prototype.hasOwnProperty.call(map, key);
	},

	/*
	 * Builds the term maps from the '_link_data' block, links <dfn> elements
	 * whose text is a Unicode glossary term, and, when the browser supports
	 * document.caretPositionFromPoint, starts listening for mouse movement
	 * over the 'MAIN' element.
	 */
	init: function(){
		// NOTE(review): Util.parseBlocks / Util.get_mapping are defined outside
		// this file; presumably they split the data into named sections and
		// turn 'term:value' lines into an object — confirm in common0a.js.
		var link_data = Util.parseBlocks(E('_link_data').textContent);
		var has = this.has;
		var glossary_url = this.glossary_url;

		var umap =
		this.unicode_map = Util.get_mapping(link_data.unicode_terms);
		var map = this.dfn_map;
		for(var key in umap){
			if(has(umap, key)) map[key] = 1;
		}
		repeat('dfn', function(dfn){
			var id = dfn.id, text = dfn.textContent;
			if(id && text){
				// A definition in this page takes precedence over the marker 1.
				map[text] = dfn.id;
			}
			if(has(umap, text)){
				// Replace the dfn's content with a link to the glossary entry.
				var a = C('a');
				a.href = glossary_url + umap[text];
				a.textContent = dfn.textContent;
				dfn.textContent = '';
				dfn.appendChild(a);
			}
		});
		if(!document.caretPositionFromPoint) {
			// No way to find the text under the pointer: disable hover linking.
			wordLinknizer = null;
			return;
		}

		Util.get_mapping( link_data.additional_terms, map);

		this.anchor = C('a');
		this.range = document.createRange();
		// Bound once so it can be handed to setTimeout directly.
		this.handle_mouse = this.handle_mouse.bind(this);
		// `this` is registered as the listener; events arrive at handleEvent.
		E('MAIN').addEventListener('mousemove', this, false);
	},

	/*
	 * mousemove listener: records the pointer position and (re)starts a 50 ms
	 * timeout, so handle_mouse only runs once the pointer has paused.
	 */
	handleEvent: function(event){
		if(this.timer > 0) window.clearTimeout(this.timer);
		this.x = event.clientX;
		this.y = event.clientY;
		this.timer = window.setTimeout(this.handle_mouse, 50);
	},

	/* Timeout callback: looks up the caret position under the last pointer position. */
	handle_mouse: function(){
		var caret = document.caretPositionFromPoint(this.x, this.y);
		if(!caret) return;
		this.enclose(caret);
	},

	/*
	 * Finds the character run at the caret position and, if it (optionally
	 * joined with the run directly before or after it) is a known term,
	 * links it.
	 *
	 * caret: object with offsetNode / offset, as returned by
	 *        document.caretPositionFromPoint.
	 */
	enclose: function(caret){
		if(!caret) return null;
		var node = caret.offsetNode;
		if(!node || (node.nodeType !== Node.TEXT_NODE)) return null;
		var parent = node.parentNode;
		if(
			(parent === this.anchor) ||
			this.has(this.tags_excluded, parent.tagName)
		) return null;

		var offset = caret.offset;
		var text = node.data;
		// A run is a sequence of CJK ideographs, of katakana, or of ASCII word
		// characters / hyphens.
		var rxp = /([\u4E00-\u9FFF]+|[\u30A1-\u30F4ー]+|[\w\-]+)/g;
		var m, last_index = 0, key;
		while(m = rxp.exec(text)){
			last_index = rxp.lastIndex;
			if(last_index < offset) continue; // run ends before the caret
			key = m[0];
			if(offset < last_index - key.length) return; // caret lies between runs
			break;
		}
		if(!m) return;

		var map = this.dfn_map;
		var start = last_index - key.length, end = last_index;
		var fix, joined;

		// 1st choice: the preceding run (plus an optional space) joined with
		// this one.
		fix = text.slice(0, start)
			.match(/([\u4E00-\u9FFF]+|[\u30A1-\u30F4ー]+|\w+) ?$/);
		joined = fix ? fix[0] + key : null;
		if(joined !== null && this.has(map, joined)){
			key = joined;
			start = end - key.length;
		} else {
			// 2nd choice: this run joined with the following one.
			fix = text.slice(end)
				.match(/^ ?([\u4E00-\u9FFF]+|[\u30A1-\u30F4ー]+|\w+)/);
			joined = fix ? key + fix[0] : null;
			if(joined !== null && this.has(map, joined)){
				key = joined;
				end = start + key.length;
			} else if(!this.has(map, key)){
				// Last choice is the run alone; it is not a known term.
				return;
			}
		}

		this.range.setStart(node, start);
		this.range.setEnd(node, end);
		this.linknize(key);
	},

	/*
	 * Moves the shared anchor so that it wraps this.range and points it at
	 * the target for `key`.
	 */
	linknize: function(key){
		var anchor = this.anchor;
		// Unwrap whatever the anchor currently encloses.
		if(anchor.parentNode && anchor.firstChild){
			anchor.parentNode.insertBefore(anchor.firstChild, anchor);
		}

		var id = this.dfn_map[key];
		if(id === 1){
			// Marker 1: the term is only known from the Unicode glossary.
			if(!this.has(this.unicode_map, key)) return;
			anchor.href = this.glossary_url + this.unicode_map[key];
		} else {
			anchor.href = '#' + id;
		}
		this.range.surroundContents(anchor);
	}
};

</script>

<!-- 

UNICODE 用語定義リンク
https://www.unicode.org/glossary/

charset レジストリ
-->

<script type="text/plain" id="_link_data">

●●unicode_terms
アクセントマーク:accent_mark
書記素クラスタ:grapheme_cluster
	子音クラスタ:consonant_cluster
	クラスタ
グリフ:glyph
フォント:font
文字符号化スキーム:character_encoding_scheme
セディーユ:cedilla
ダイアクリティカルマーク:diacritic
バイト:byte
プレーンテキスト:plain_text
レパートリ:character_repertoire
	repertoire
異体字:y_variant
	異体字:z_variant
音節:syllable
音素:phoneme
基底文字:base_character
基本多言語面:basic_multilingual_plane
互換等価:compatibility_equivalent
互換文字:compatibility_character
合字:ligature
	子音:consonant
私用符号位置:private_use_code_point
私用領域:private_use_area
私用:private_use
視覚順:visual_order
字:letter
書記素:grapheme
書式文字:format_character
照合:collation
Unicode 照合:unicode_collation_algorithm
	Unicode 照合アルゴリズム
正規化:normalization
代用符号位置:surrogate_code_point
bicameral:bicameral
	文字の大小の区別がある:bicameral
	大文字・小文字の区別がある:bicameral

等価性:equivalence
非文字:noncharacter
表記体系:writing_system
符号位置:code_point
Unicode 符号化形式:unicode_encoding_form
UTF-16:UTF_16
UTF-8:UTF_8
UTF-32:UTF_32
US-ASCII:ASCII
符号化形式:character_encoding_form
文字符号化形式:character_encoding_form
符号化文字集合:coded_character_set
	符号化法:encoding
符号単位:code_unit
符号変換:transcoding
	符号変換器:transcoder
文字:character
用字系:script
文字名:character_name
	文字列:string
文脈に応じた変形:contextual_variant
平仮名:hiragana
片仮名:katakana
ハングル:hangul
ヴィラーマ:virama
母音:vowel
論理順:logical_order
	論理単位:logical unit
文字集合:character_set
双方向:BIDI
合成済:precomposed_character
分解済:decomposable_character
対称交換:symmetric_swapping
ビッグエンディアン:big_endian
結合文字:combining_character
結合マーク:combining_mark
抽象文字:abstract_character
符号化文字:encoded_character
CCS:ccs
CEF:cef
CES:ces
PUA:pua
IANA:IANA
アルファベット:alphabet
HTML:HTML
XML:XML
プロトコル:higher_level_protocol
描画:rendering
レンダリング:rendering
UCS:UCS
Unicode 技術報告:unicode_technical_report
	tailorable
	バイト順:byte_order_mark
漢字:kanji
	endian
大小:case
大文字:uppercase
小文字:lowercase

●●additional_terms
論理的選択:logical-selection-mode
視覚的選択:visual-selection-mode
文字列:sec-Strings
付番:sec-stringIndexing
符号化法:character-encoding
選定:sec-Encodings
エスケープ法:escaping
エスケープ:def-char-escape
	識別:sec-EncodingIdent

</script>

<script type="text/plain" id="_source_data">


●●options

spec_date:2005-02-15
trans_update:2014-03-30
original_url:http://www.w3.org/TR/2005/REC-charmod-20050215/
spec_status:REC
main:MAIN
toc:contents
no_original_dfn:true
trans_1st_pub:2013-05-20

●●words_table


	（ UNICODE 用語）
アクセントマーク:accent
クラスタ:cluster
グリフ:glyph
文字符号化:character encoding
スキーム:scheme
セディーユ:cedilla
ダイアクリティカルマーク:diacritics
バイト:byte
プレーンテキスト:plain-text
レパートリ:repertoire
異体字:variants
音節:syllable
	音節:syllabic
音素:phoneme
基底文字:base character
基本多言語面:Basic Multilingual Plane
互換等価:compatibility equivalents
互換文字:compatibility character
合字:ligature
子音:consonant
私用符号位置:private use code point
私用領域:private use area
視覚順:visual order
字:letter
書記素:grapheme
書式文字:formatting character
照合:collation
正規化:normalization
代用符号位置:surrogate code points
	文字の大小の区別がある:bicameral
	大文字・小文字の区別がある:bicameral
等価性:equivalence
非文字:noncharacters
表記体系:writing system
符号位置:code point
符号化形式:encoding form
文字符号化形式:character encoding form
符号化文字集合:coded character set
符号化文字:encoded character
符号化法:encoding
符号単位:code unit
符号変換:transcoding
符号変換器:transcoder
文字:character
用字系:script
文字名:character name
文字列:string
	文脈に応じた変形:contextual variants
平仮名:hiragana
片仮名:katakana
ハングル:Hangul
ヴィラーマ:virama
母音:vowel
論理順:logical order
論理単位:logical unit
文字集合:character set
双方向:bidirectional
結合:combining
合成済:precomposed
分解済:decomposed
対称交換:symmetric swapping
ビッグエンディアン:big-endian
結合文字:combining character
抽象文字:abstract character
私用:private use
	（ UNICODE 用語ここまで）

表示異体:display variants
タイプ:type
大小:case
抽象:abstract
文字符号化法:character encoding
符号化処理:encoding process
文字文字列:character string
符号化単位:units of encoding
文字列内:string 内
入力文字:input string
私的合意:private agreement
照合単位:collation unit
論理的選択:logical selection
視覚的選択:visual selection
論理的:logical
視覚的:visual
視覚:visual
論理範囲:logical 範囲:logical range
ウェブ:Web
サイト:site
テキストデータ:text data
メモリ:memory
インタフェース:interface
スクリーン:screen
ビット:bit
オクテット:octet
アドバイス:advice
アクセス性:accessibility
アプリケーション:application
アルゴリズム:algorithm
アンエスケープ:unescaping
インスタンス:instance
	インデックス:indices
エスケープ:escape
エスケープ法:escaping
オブジェクト:object
キーストローク:keystroke
キーボード:keyboard
キー:key
キャレット:caret
テキストデータオブジェクト:textual data object
データ:data
パラメタ:parameter
データオブジェクト:data object
デッドキー:dead-key
テキスト:text
ドライバ:driver
バージョン:version
コンポーネント:component
サイズ:size
セクション:section
プログラム:program
アーキテクチャ:architecture
パタン:pattern
モード:mode
モデル:model
基準処理:reference processing
マークアップ:markup
ロケール:locale
コメント:comment
コンピュータ:computer
コンピュータ利用:computing

ソフトウェア:software
サポート:support
ジェスチャ:gesture
システム:system
ファイルシステム:file system
ファイル:file
プロセッサ:processor
スペース:space
バッファ:buffer
経験則:heuristics
プラットフォーム:platform
プログラミング:programming
プロトコル:protocol
フォント:font
発音体系:phonetics
演算:operation
付番:indexing
音声言語:spoken language
言語:language
グループ:group
マウス:mouse
メッセージ:message
グラフィック:graphic
ラベル:label
リソース:resource
レジストリ:registry
レンダリング:rendering
デジタル:digital
デザイン:design

コミュニティ:community
パス:pass
意味論:semantics
解析対象実体:parsed entities
外部解析対象実体:external parsed entities
外部実体:external entities
型:type
	形:form
データ形式:format
	形式:format
データ型:data 型:datatype

記号:symbol
起点:origin
互換性:compatibility
符号単位文字列:code unit string
単位:unit
格納単位:格納 unit
標準:standard
部分文字列:substring
適合-:conform
	conforming
適合性:conformance
適合性基準:conformance criteria
判定基準:criteria
基準:reference

同一性:identity
入力:input

比較:comparison
符号化:encode
変換:convert
規範的:normative
疑似属性:pseudo-attribute
区切子:delimiter
構文:syntax
検索:search
特定的:specific
整列:sort
構文解析器:parser
構文解析:parse
構文上:syntactic
構文文字:syntax-significant
生成規則:Production
素子:device
層:layer
相互運用性:interoperability
相互作用:interaction
総称的:generic
多段階:multi-level
選択:select
競合:conflict
公式化:formulation
公式的:formal
項目:entry
声音:sound
知覚:perception
知覚-:perceive
抽象化:abstraction
抽象的:abstract
特性:property
識別子:identifier
実体:entity
不可分:atomic
数値文字参照:numeric character references
世界共通:universal
	ユニバーサル
論理:logical
要素:element
別名:alias
不透明:opaque
不一致:inconsistencies
合致検出:matching
視覚表示:visual display
既定:default
描画:rendering
描画-:render

切替:switching

結合マーク:combining mark
切替スキーム:switching scheme
直列化スキーム:serialization scheme
テキスト内容:textual 内容:textual content

バイト順:byte order
エスケープ形:escaped form

	音節クラスタ
	双方向テキスト
	書記素クラスタ:grapheme cluster
	修飾キー
	利用者インタフェース
	処理パス
	基準処理モデル
	文字エスケープ
	文字符号化スキーム
	
	符号化法タグ
	バイト文字列




スペイン語::Spanish
スロヴァキア語::Slovak
タイ語::Thai
ドイツ語::German
アラビア語::Arabic
インド語派::Indic
カナダ・フランス語::French-Canadian
英語::English
フランス語::French
ヘブライ語::Hebrew
ロシア語::Russian
韓国語::Korean
中国語::Chinese
日本語::Japanese
ペルシャ語::Persian
タミル語::Tamil
スウェーデン語::Swedish

頭字::initial
中字::median
中字形::medial form
尾字形::final form
尾字::final
大文字::capital
大文字::upper case
小文字::lower case

転送::transfer
伝送::transmit
伝達::transmission

指針::guideline
交換::interchange
格納::store
格納域::storage area
格納単位::units of storage
物理的::physical
文書::document
文脈::context
文脈下::context 下
要件::requirement
仕様::specification
実装::implementation 
実装-:実装:implement
実装者::implementer
内容::content
開発者::developer
内容開発者::content developer
利用者::user
末端利用者::end user
語源::word root
宣言::declaration
国際的::international
国際標準::International Standard
混同::confusion
混同-:混同:confuse
情報::information
勧告::recommendation
環境::environment
改行::line-break
予約::reserve
未登録::unregistered
登録-:登録:register
規約::convention
記述::description
受容::accept
処理::processing
処理-::process
対話::interaction
整数::integer
識別::identification
識別-:識別:identify
順序::order
単語::word
私的::private
構成子::constructs
解釈-::interpret
解釈::interpretation
概念::concept
認識概念::notion
間接的::indirect
判別::distinguish
公式::official
参照文献::bibliography
参照文献1:参照文献:reference

対応関係::mapping
視野::scope
複数バイト:multibyte
曖昧照合::fuzzy matching
構築::construction
構築-::construct
値::value
受信::receive
受信側::receive 側
修飾キー::modifier key
サーバ::server
プロキシ::proxy
クライアント::client
カット::cut
ペースト::paste
リンク::link
リスト::list
アーカイブ::archive
アクセス::access
アプローチ::approach
アルファベット::alphabetic approach
ストリーム::stream
セミコロン::semicolon
タグ::tag
タグ付け::tagging
テキストの::textual
バイト文字列::byte string
ブラウザ::browser
プログラマ::programmers
ペア::pair
ページ::page
マーカ::marker
レベル::level
ローマ字入力::romaji input
メソッド::method
	行折り返し::word-wrapping
廃止予定::deprecated
発行::publish
反例::counter-example
非公式的::informal
表示::display
分離::divide
変種::variation
不連続性::discontinuity
名前::name
明示的::explicit
用語::term
意図::intend
保守::maintain
違反::violate
一貫性::consistency
一対一::one-to-one
一対多::one-to-many
仮名::kana
開発-::develop
開発::development
拡大::scale
格納順序::storage order
勧告案::Proposed Recommendation
慣行::convention
基底::base
関数::function
基本的::basic
規範::reference material
技術報告::Technical Report
誤利用::misuse
構造::structure
合成済形式::fully composed form
合致::match
合法的::legitimate
国際化::Internationalization
根本原則::Fundamentals
左右双方向::both directions
作成者::author
始端::start
試験::test
自然言語::human language
自動的::automatic
出力::output
上位互換::upwards-compatible
数学記号::mathematical symbol
数値::number
制限::limit
制御::control
制御文字::control codes
任意選択::OPTIONAL
等価::equivalent
動的::dynamical
判別可能::distinguish 可能
反映::reflect
汎用::general
範囲::range
番号::index
非負整数::non-negative integer
筆記::writing
表引き::lookup
復号::decode
複製::copy
成分::component
設計::design
選好::preference
選択肢::choice
選択範囲::selection
前処理::pre-processing
全範囲::full range
相互運用可能::interoperable
送信::sent
属性::attribute
多言語環境::multilingual environment
多対一::many-to-one
多対多::many-to-many
妥当::valid
対象言語::target languages
画像::image

非電子媒体:::non-electronically 媒体
否認:::disallow
一定量:::certain amount
可変個:::variable number
可変長:::variable-length
実用的:::pragmatical
詳細:::details
精確:::precise
設計変更:::redesign
半規則的:::semi-regular
不規則:::irregular
子音的::consonantal
内部:::internal
堅牢:::robust
提案:::propose
適応:::accommodate
適格:::admissible
適性:::suitability
適切:::appropriate
統計的解析:::statistical analysis
混成:::intermingled
除外:::exclude
衝突:::collide
外部:::external
列挙:::enumerate
論題:::topic
規則:::rule
狭間:::between
支配:::govern
重大:::crucial
重要:::important
自明:::trivial
柔軟:::flexible

修飾形:::modified
終端:::end

	選定-:identify/choose
選定-:::choose
選定:::choice
	採用:choose

事由:::reason
技術委員会:::technical committees
姉妹:::companion
施策:::policy
合意:::agreement
依拠:::rely
一意的:::unique
課題:::issue
獲得:::capture
境界:::boundary
機能性:::functionality
裁定:::decision
決定:::determine
決定的:::vitally
結果:::result
誤解:::misunderstanding
醸成:::create
節:::section
側面:::aspect
尊守:::adhere
脱落:::eliminate
知識:::knowledge
定義:::definition
定義-:::define
同調的:::synchrony
日常:::everyday

発生:::arise
発展:::evolving
表現:::representation
表現-:::represent
片:::piece
本質:::essence
有用:::useful
予測可能:::predictable
例外:::preclude
生成:::produce
独立:::independent
組織:::organization
前置:::precede
数学的:::mathematical
明白:::clear
表記-:::express
表記:::expression
一義的:::unambiguous
絵図:::picture
考察:::examine
検討:::weigh
逸脱:::deviate

因子:::factor
影響-:::affect
影響:::implications
英文:::English sentence
応用:::application
恩恵:::benefit
通信:::communication
記法:::notation


外側:::outside
確実:::reliable
確保:::ensure
観測:::observe
関心:::interest
基礎:::basis
技術:::technology
許容:::allow
禁止:::prohibit
形容詞:::adjective
経験:::experience
継承:::inherit
検査:::check
見方:::view
原理的:::in principle
個人:::individuals
誤認:::mistaken


候補:::candidate
効率的:::efficient
効率性:::efficiency
更新:::update
根本的:::radical
採択:::adopt
索引項目:::glossary entry
産業界:::industry
指定:::specify
指定1:::designate
思考:::mind
自然:::natural
辞書:::dictionary
実在:::reality
実施:::practice
実践:::practice
取得:::retrieval
手段:::means
終了:::terminate
集合体:::collection
	足る:::sufficient
柔軟性:::flexibility
序論:::Introduction
将来:::future
冗長:::redundant

成果:::work
成長:::grow
正方形:::square
素直:::straightforward
組織化:::organize
創造性:::creativity
挿入:::insert
操作:::manipulation
早見表:::quick reference
相違:::difference


多様性:::variation
代表的:::typical
大文字・小文字::upper and lower case letters
単純:::simple
単方向::unidirectional
方向:::direction
団体:::party
段落:::paragraph
置換:::replace
中核:::core
聴覚::aural
直接的::direct
提供:::provide
展開:::expanding
伝統的:::traditional
伝統:::tradition
電話帳:::telephone book
特質:::nature
特殊:::special
特徴的:::interesting
日常利用:::everyday use
同義語:::synonym
認識:::recognize
廃止:::deprecate
背景:::background
配慮:::care
反対:::reverse
不死鳥:::phoenix
不完全:::imperfect
不十分:::incomplete
不適切:::inappropriate
不連続::discontiguous
物理的:::physical
文化:::culture
文化継承:::cultural heritage
文化的慣習:::cultural conventions
文書順::document order
変化:::change
変更:::change
編集:::edit
保持:::keep
保証:::guarantee
補完:::complete
豊富:::wealth
翻訳:::translation
密接:::close
無視:::ignore
明確化:::clarify
綿密:::careful
目標:::goal
優位性:::advantage
優先順位:::priorities
様相:::phenomena
抑止:::discourage
利用可能:::available
理解:::understand
理由:::reason
留意:::be aware
歴史的:::historical
列挙一覧:::enumerated list
連続的::continuous
労力:::effort
枠内:::block
恣意的:::arbitrary

	曖昧さ::ambiguity
曖昧性:::ambiguity
	ambiguous
世界:::around the world
必要性:::needs
頻度:::frequency
可能性:::possibility
キー押下::keypress
顧客:::customers
時代:::age
歴史:::history
要求:::require
仕組み:::mechanisms
推奨:::recommend
補正:::adjusting


●●spec_metadata



最新バージョン
    http://www.w3.org/TR/charmod/ 
以前のバージョン
    http://www.w3.org/TR/2004/PR-charmod-20041122/ 

編集
	Martin J. Dürst, W3C <a href="mailto:duerst@w3.org">&lt;duerst@w3.org&gt;</a>
	François Yergeau (Invited Expert)
	Richard Ishida, W3C <a href="mailto:ishida@w3.org">&lt;ishida@w3.org&gt;</a>
	Misha Wolf (until Dec 2002), Reuters Ltd. <a href="mailto:misha.wolf@reuters.com">&lt;misha.wolf@reuters.com&gt;</a>
	Tex Texin (Invited Expert), XenCraft <a href="mailto:tex@XenCraft.com">&lt;tex@XenCraft.com&gt;</a>

<a href="http://www.w3.org/2005/02/charmod-fundamentals-errata.html">正誤表</a>
	規範的な修正が含まれることもあります。

<a href="http://www.w3.org/2003/03/Translations/byTechnology?technology=charmod">各国語翻訳</a>
	英語版のみがこの仕様の規範的バージョンです。


●●trans_metadata
<p>
~THIS_PAGEは、 W3C により勧告として公開された
Character Model for the World Wide Web 1.0: Fundamentals
を日本語に翻訳したものです。
~PUB
</p>

</script>


<!-- 
格納単位:sec-Storage
格納:sec-Storage

		map['符号化法'] = 'character-encoding';//etc.

CDATA セクション:http://www.w3.org/TR/REC-xml/#sec-cdata-sect
XML 宣言
テキスト宣言
encoding 疑似属性
charset レジストリ
XML テキスト
XML 仕様
 -->
<!-- UNICODE 用語対訳
https://www.unicode.org/terminology/term_en_ja.html
-->


<!--% その他の対訳

	ふるまい::behavior
	位置 position, locate
	意味:::meaning
	意味:::significance
	機能1:::functionality
	最終的に:::finally
	最新の:::latest
	草創期:::early days
	多様:::large variety
	大幅:::significant
	単一の:::single
	単独の:::single

	一定層の:set of
	実際::actual/...
	容易にする::facilitate/...
	適する:suitable
	解決:address/resolve/resolution

	築く:build
	目的::purpose/target
	個々:individual

	〜として指定:designate

	エンコーディング: 1
	シャクル: 1
	ニクダー: 1

	複数バイト:multibyte quantities
	割合が更に高まっている::more and more places
	16 進:hexadecimal
	その場で:on the fly
	できなくなる:prevent
	に基づく:oriented
	に登録済みの:IANA-registered IANA
	バージョン付け:versioning
	ほんの触りしか取り挙げられてない:barely touched
	まとめられ:aggregate
	みなす:consider
	やりとり:exchange
	より適切な形にあつらえる:tailoring
	テキストとしての特質:textual nature
あつらえの:tailored
	関心を持つ団体:interested individuals
	機構が既に用意されている:facilities offered
	右から左へ配置する:laid out right-to-left
	文字エスケープの展開:expanding-a-character-escape
	文字の並び:sequence of characters
	矛盾無く定義され:well-defined
	明確な指示書き:Well-defined instructions
	予め合意され:agreed-upon
	由来する:derives from
	依拠できる知識:reliable knowledge
	失わせ:disregarding
	実際に機能する:effective
	実装の候補:likely candidates for implementation
	主要な部分を占める:primary concern
	取り挙げる:addressed
	取り込まれた:embrace

	国:national governments
	データベースベンダ:database vendor
	利用者グループ
	企業:corporation
	研究機関:research institutions
	国際的機関:international agencies
	国際電気標準会議:International Electrotechnical Commission
	国際標準化機構:International Organization for Standardization

	メンバ: 3
	以外: 3
	以上: 1
	以前: 2
	位置付け: 4
	依存: 16
	囲う:surround
	意味: 26
	異なる:different
	一貫した:consistent
	一種類:one
	一緒:together with
	一段: 1
	一致: 2
	一定: 4
	一定の:certain
	一定範囲: 2
	一点:point
	一般に:typically
	一部: 22
	一部含: 1
	一部分:parts
	一方: 7
	一覧: 15
	一連: 1
	引数:argument
	押し下げ:presses
	押下: 2
	可能: 8
	過程:process/ 3
	解する:Understanding
	解り難い:obscure
	改訂:revised
	修正:amended
	各位置: 1
	拡張: 3
	確実に:reliably
	確定できない:ambiguous
	割り当て:allocated,assignment
	活動的に:actively
	完全: 4
	間違った:erroneous
	関わらず:regardless
	関係:relationship/...
	関連付: 1
	関連付ける:relate
	基づく:based
	既存: 2
	既知の:known
	期待: 1
	規定する:define:~
	規定する:prescribe:~
	起因: 1
	義務付: 4
	逆に:contrary to
	急速に:quickly
	巨大:large
	供する:provide
	共通の:common
	共同:jointly
	協同: 3
	強まる:strengthen
	近似:close
	区切: 1
	区切り:delimit:~
	区別:distinction/distinct/...
	具合: 1
	具合が悪いことに:Unfortunately
	桁目: 1
	結果得: 2
	検知: 1

考案:invent/devise
	考慮:consider/ 4

	現在: 4
	現時点では:currently
	言及: 1
	個数: 2
	個別: 1
	呼ばれる:called
	固定的: 2
	固有: 2
	固有の:inherent

	後者: 2
	後続: 3
	公開:expose 8
	向上: 3
	好ましい:good
	好例:good example
	広範:wide base/widespread
	構成:make up
	構造的:structured
	構築された:building
	構文上の意味
	行われる:took place
	高次:higher
	合間:between
	再順序付:reordering
	最後: 7
	最初: 10
	最小:smallest/minimal
	最小限に抑え:minimizes
	最低: 1
	採用: 7
	採用する:adopt
	裁量に委ね:matter
	作成:create/author
	策定: 1
	三者: 1
	参考: 10
	参考情報:informative
	参照: 43
	参照している:referring
	算出: 1
	仕分:break ties
	仕方: 1
	指示: 1
	支持は得ている:favoured
	事項:item
	持ち込み:impose
	時の経過:time to time/over time
	時点: 2
	次の:next
	次節: 2
	示唆: 1
	自身: 3
	自体: 1

	実際には:actually

	写像: 1
	主要: 3
	手前: 1
	手法: 2
	種々の:various
	種類: 2
	集合: 6
	十全に:quite
	重要でない:unimportant
	述べ-:describe
	順序付: 6
	順序付け:ordering
	順番: 2
	小さな:small: 5
	少数: 3
	少量: 1
	焦点: 1
	場合: 39
	情報交換:liaison
	条件:prerequisite/conditions
	状況:situations/circumstances
	状態:state
	進数: 1
	進数字: 1
	人:human
	人間:human
	推定: 1
	数える:count
	数え方:counting
	数多: 1
	制作: 1
	正常: 1
	製作者:producer
	説明:explain
	先頭: 2

	前掲の:preceding
	前節:previous section
	全体: 4
	全体像: 1
	組み合わせる:combine
	想定: 2
	相当: 2
	捉える:view
	存在: 3
	多種多様: 2
	多重的:duplicate
	多大:great
	妥当性を検証:validate
	対応-:correspond
	対応: 16
	対象: 6
	対象外の:not covered
	大半:most/ 8
	大半の:most
	正常に機能し続ける:not break down
	相当する位置付けのもの:comparable in standing to
	大幅に向上する:significant improvement
	注目される点:interesting aspect
	長く込み入った:long and tortured
	大文字化: 1
	短縮: 4
	段階:
	段目: 1
	地位:part
	中略: 1
	注意: 7
	注意深く:carefully
	注記:note
	通常:usually, normal
	注目:interesting,attention
	直上: 1
	追加の:additional
	綴られ:spell
	程度: 2
	適用: 15
	適用する:apply
	適用対象: 1
	登録済: 3
	都合の良い:favorable
	踏襲: 1
	同じ:same
	同一の:identical
	大小の同一視:ignore case
	同等性比較:equating
	同様: 4
	導入／もたらす:introduces
	特に具体的に:specifically
	特記すべき方法:noteworthy
	特殊な機能: special functions
	特定: 19
	特定の:particular
	特別: 2
	読者:reader/audience
	馴染: 1
	二重引用符:double quote
	日付: 2
	任意数:as many
	年号: 1
	念頭: 2
	把握: 2
	配置:laid out
	発行後:publish 後
	発行日:date of publication
	番目: 2
	避ける:eliminate
	非:non
	必要: 21
	頻繁:many
	付与: 2
	付録: 4
	符号値:numbers
	符号点:
	部分: 5
	復元:recover
	幅広い:wide
	複雑::complex
	複雑さ:complexities
	複数:multiple
	複数個: 5
	正確に:exactly
	展開:expanding
	専用:dedicated
	道:course
	文脈付: 1
	文脈付け:contextualize
	平和: 1
	並び:sequence
	並べ方:sequencing
	別の:alternative
	別の:another
	変更点:Changes
	編集上:editorial
	返す:return
	便宜:benefit
	保証され:guaranteed
	補助: 1
	抱えている:carries
	方法: 20
	方法:way
	豊富な:wealth
	本文:body/main body
	埋め込み:inclusion
	末尾:end

	明確:specific/clear
	明白でない:unclear
	目的以外: 1
	問い:questions
	問題:matter/problematic/issue
	問題になる:problematic
	役立: 1
	唯一:only
	優先する:prefer
	有益: 1
	有益な:helpful
	有無: 2
	余地無: 1
	余分:
	用意: 3
	
	要する:require
	要約:In short/summary/abstract

	用途:use
	用法:usage/use
	利用: 108
	利用する側:recipients
	理に適った:reasonable
	理想的:ideally
	両者: 3
	両端: 2
	例:example
	例外的:exceptional
	例示: 4
	連続-:consecutive
	連続: 1
	孕む:involve
	曖昧さ:ambiguity

	増加: 2
	追加: 8
	統一: 1
	多数: 2
	以下: 5
	一定個数: 1
	一般: 5
	一般的: 5
	引用:citing
	関連: 7
	規定: 12
	許容: 11
	経過: 1
	指示書き: 1
	実現: 1
	謝辞:Acknowledgements
	早期: 1


-->


<body>


<header class="head">



	<hgroup>
<h1 title="Character Model for the World Wide Web 1.0: Fundamentals">ウェブのための文字モデル 1.0：根本原則</h1>
<h2 title="W3C Recommendation 15 February 2005">2005 年 2 月 15 日付 W3C 勧告</h2>
	</hgroup>

<details id="_trans_metadata"><summary></summary></details>

<details id="_spec_metadata"><summary>仕様メタデータ</summary></details>

<details><summary>©</summary><small class="copyright">
<a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a>&nbsp;©&nbsp;2005&nbsp;<a href="http://www.w3.org/"><abbr title="World Wide Web Consortium">W3C</abbr></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><abbr title="Massachusetts Institute of Technology">MIT</abbr></a>, <a href="http://www.ercim.org/"><abbr title="European Research Consortium for Informatics and Mathematics">ERCIM</abbr></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.
</small></details>

</header>

<hr>

<section id="abstract">
<h2 title="Abstract">要約</h2>

<p>
この概念的枠組み仕様は、
仕様の作成者／ソフトウェア開発者／内容開発者
に向けて、
Unicode 標準（ Unicode Standard ）と ISO/IEC 10646 の共同により定義された Universal Character Set の上に築かれた， ウェブ上の相互運用可能なテキスト操作のための、共通の基準を提供する。
この文書は、次の論題について取り挙げる：
用語［
“<span class="qterm">文字</span>”,
“<span class="qterm">符号化法</span>”,
“<span class="qterm">文字列</span>”
］の用法,
基準処理モデル,
文字符号化法の選定と識別,
文字エスケープ法,
文字列の付番（ indexing ）
<span lang="en">
This Architectural Specification provides authors of specifications, software developers, and content developers with a common reference for interoperable text manipulation on the World Wide Web, building on the Universal Character Set, defined jointly by the Unicode Standard and ISO/IEC 10646. Topics addressed include use of the terms 'character', 'encoding' and 'string', a reference processing model, choice and identification of character encodings, character escaping, and string indexing.
</span></p>

<p>
文字列の正規化と同一性合致検出については、姉妹文書
“ウェブのための文字モデル 1.0 ：正規化”
—
<cite>Character Model for the World Wide Web 1.0: Normalization</cite>
<a href="#charnorm">[CharNorm]</a>
を参照されたし。
リソース識別子については、姉妹文書
“ウェブのための文字モデル 1.0 ：リソース識別子”
—
<cite>Character Model for the World Wide Web 1.0: Resource Identifiers</cite>
<a href="#charmod3">[CharIRI]</a>
を参照されたし。
<span lang="en">
For normalization and string identity matching, see the companion document Character Model for the World Wide Web 1.0: Normalization [CharNorm]. For resource identifiers, see the companion document Character Model for the World Wide Web 1.0: Resource Identifiers [CharIRI].
</span></p>

</section>
<section id="status">

<h2 title="Status of this document">この文書の位置付け</h2>

<p>
<em>この節では、発行時点におけるこの文書の位置付けについて述べます。他の文書がこの文書に取って代わる可能性があります。W3C の現在の発行文書一覧とこの文書の最新の状態は http://www.w3.org/TR/
<a href="http://www.w3.org/TR/">W3C technical reports index</a>
にて見られます。</em>
<span lang="en">
This section describes the status of this document at the time of its publication. Other documents may supersede this document. A list of current W3C publications and the latest revision of this technical report can be found in the W3C technical reports index at http://www.w3.org/TR/.
</span></p>

<p>
この文書は、
<a href="http://www.w3.org/2003/06/Process-20030618/tr.html#RecsW3C"> W3C 勧告</a>
<cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>
仕様です。
この文書は W3C メンバや関心を持つ団体から吟味され、ディレクターにより W3C 勧告として承認されたものです。
これは安定的な文書であり、規範として利用したり，規範的な文献として他の文書に引用することができます。
勧告の発行における W3C の役割は、仕様に対する注目を集め，広範囲への普及を促進する所にあります。
これはウェブの相互運用性と機能性を向上させるものです。
<span lang="en">
This document contains the Character Model for the World Wide Web 1.0: Fundamentals specification, and is a W3C Recommendation. It has been reviewed by W3C Members and other interested parties and has been endorsed by the Director. It is a stable document and may be used as reference material or cited as a normative reference from another document. W3C's role in making the Recommendation is to draw attention to the specification and to promote its widespread deployment. This enhances the functionality and interoperability of the Web.
</span></p>

<p>
この文書は、 Internationalization Interest Group の援助の下に
<a href="http://www.w3.org/International/core/">W3C Internationalization Core Working Group</a>
による
<a href="http://www.w3.org/International/Activity">W3C
Internationalization Activity</a>
の一環として開発されました。
<span lang="en">
This document was developed as part of the W3C Internationalization Activity by the W3C Internationalization Core Working Group, with the help of the Internationalization Interest Group.
</span></p>

<p>
この文書に関し，コメントがあれば、
<a href="mailto:www-i18n-comments@w3.org">www-i18n-comments@w3.org</a>
（<a href="https://lists.w3.org/Archives/Public/www-i18n-comments/">公開アーカイブ</a>）
宛てまで寄せられるよう願います。
用済みになった Last Call は
<a href="http://www.w3.org/2004/02/charmod1-lastcall/">公開バージョン</a>
と
<a href="http://www.w3.org/International/Group/2004/charmod1-lc/">メンバ専用バージョン</a>
から入手できます。
<a href="http://www.w3.org/2004/11/charmod-implementation/">実装報告</a>
も入手できます。
勧告案の段階から勧告バージョンまでの間に この文書に加えられた変更については
付録 E：<a href="#sec-Changes"><b>勧告案からの勧告までの間の変更点</b></a>
に記載されています。
<span lang="en">
If you have comments on this document, send them to www-i18n-comments@w3.org (public archive). Last Call dispositions are available in a public version and a Members-only version. There is also an implementation report. Changes to this document since the Proposed Recommendation version are detailed in E Changes since the Proposed Recommendation.
</span></p>

<p>
この文書は
<a href="http://www.w3.org/TR/2002/NOTE-patent-practice-20020124">24 January 2002 CPP</a>
<a href="http://www.w3.org/2004/02/05-pp-transition">W3C Patent Policy Transition Procedure</a>
に従事するグループにより制作されました。
Working Group はグループの成果物に関連して、この文書に関わる
<a href="http://www.w3.org/2004/01/pp-impl/32113/status">public list of patent disclosures</a>
を作成し保守しています。
そのページには特許開示の手引きも含まれています。
特許に関する実際的知識を持ち、そこにこの仕様に対する Essential Claim(s) が含まれていると主張する者は
<a href="http://www.w3.org/Consortium/Patent-Policy-20040205/#sec-Disclosure">W3C Patent Policy 6 節</a>
に則って情報を公開するべきです。
<span lang="en">
This document was produced under the 24 January 2002 CPP as amended by the W3C Patent Policy Transition Procedure. The Working Group maintains a public list of patent disclosures relevant to this document; that page also includes instructions for disclosing a patent. An individual who has actual knowledge of a patent which the individual believes contains Essential Claim(s) with respect to this specification should disclose the information in accordance with section 6 of the W3C Patent Policy.
</span></p>

</section>

<nav id="contents" class="toc">
<h2 title="Table of Contents">目次</h2>

</nav>

<!-- 
<nav id="appendices">
<h3 title="Appendices"><a>Appendices</a></h3>
</nav>
 -->

<hr>

<main id="MAIN">

<section id="sec-Intro">
<h2 title="Introduction">1. 序論</h2>

	<section id="sec-GoalsScope">

<h3 title="Goals and Scope">1.1. 目標と視野</h3>

<p>
“ウェブのための文字モデル” の目標は、W3C が目指す世界共通のアクセス
—
<a href="http://www.w3.org/Consortium/mission.html"><cite>W3C goal of universal access</cite></a>
に則り、すべての人々から，彼らの
言語／用字系（用字, 文字体系）／表記体系／文化的慣習
に関わらず、ウェブ（ World Wide Web ）を利用し易くすることである。
この目標に欠かせない基本的な条件は、世界で利用されている文字を，矛盾無く定義しつつ, 理解し易い方法により，伝送／処理を行えるようにすることである。
<span lang="en">
The goal of the Character Model for the World Wide Web is to facilitate use of the Web by all people, regardless of their language, script, writing system, and cultural conventions, in accordance with the W3C goal of universal access. One basic prerequisite to achieve this goal is to be able to transmit and process the characters used around the world in a well-defined and well-understood way.
</span></p>

<p>
この仕様の主な読者には、 W3C 仕様の開発者が想定されている。
この仕様あるいはその一部分は、他の W3C 仕様から参照され得る。
これは、 W3C 仕様のみならず，他の仕様に対する適合性基準も定める。
<span lang="en">
The main target audience of this specification is W3C specification developers. This specification and parts of it can be referenced from other W3C specifications. It defines conformance criteria for W3C specifications as well as other specifications.
</span></p>

<p>
この仕様が想定している他の読者には、
ソフトウェア開発者／内容開発者／ W3C の外側で策定される仕様の作成者
も含まれている。
ソフトウェア開発者と内容開発者は W3C 仕様を実装し, 利用する。
この仕様は、 W3C 仕様を実装し, 利用する，実装（ソフトウェア）, および内容に対し、いくつかの適合性基準を定める。
これはまた、ソフトウェア開発者や内容開発者達が， W3C 仕様に含まれる文字に関する規定を理解するための助けにもなる。
<span lang="en">
Other audiences of this specification include software developers, content developers, and authors of specifications outside the W3C. Software developers and content developers implement and use W3C specifications. This specification defines some conformance criteria for implementations (software) and content that implement and use W3C specifications. It also helps software developers and content developers to understand the character-related provisions in W3C specifications.
</span></p>

<p>
この仕様に述べられる文字モデルは、 ウェブ上のテキスト操作を，一貫性を備えた, 相互運用可能なものにするための 共通の基準を、
仕様の作成者／ソフトウェア開発者／内容開発者
に提供する。
これら三者の協同により，より国際的なウェブを築くことが可能になる。
<span lang="en">
The character model described in this specification provides authors of specifications, software developers, and content developers with a common reference for consistent, interoperable text manipulation on the World Wide Web. Working together, these three groups can build a more international Web.
</span></p>

<p>
この “ウェブのための文字モデル” — Character Model for the World Wide Web
の根本原則 編で取り挙げられる論題には次のものがある：
語［
“<span class="qterm">文字</span>”,
“<span class="qterm">符号化法</span>”,
“<span class="qterm">文字列</span>”
］の用法,
基準処理モデル,
文字符号化法の選定と識別,
文字エスケープ法,
文字列の付番
<span lang="en">
Topics addressed in this part of the Character Model for the World Wide Web include use of the terms 'character', 'encoding' and 'string', a reference processing model, choice and identification of character encodings, character escaping, and string indexing.
</span></p>

<p>
Character Model の他の部分は、正規化と文字列の同一性合致検出（
<a href="#charnorm">[CharNorm]</a>
）と
IRI （ Internationalized Resource Identifiers ）の規約（
<a href="#charmod3">[CharIRI]</a>
）からなる。
<span lang="en">
Other parts of the Character Model address normalization and string identity matching ([CharNorm]) and Internationalized Resource Identifiers (IRI) conventions ([CharIRI]).
</span></p>

<p>
まだ全く, あるいは ほんの触りしか取り挙げられてない論題には、曖昧照合や, 言語のタグ付けなどがある。
これらの論題の一部については、この仕様の将来バージョンにて，解決されるものと期待される。
<span lang="en">
Topics as yet not addressed or barely touched include fuzzy matching, and language tagging. Some of these topics may be addressed in a future version of this specification.
</span></p>

<p>
モデルの中核を担うのは、
Unicode 標準
<a href="#unicode">[Unicode]</a>
と ISO/IEC 10646
<a href="#iso10646">[ISO/IEC 10646]</a>
の協同により定義された， Universal Character Set （ UCS ）である。
この文書は、 Universal Character Set の同義語として，語
<dfn id="Unicode">Unicode</dfn>
を用いる。
モデルは、世界の用字系（および様々なプラットフォーム）で作成されたウェブ文書を、世界のウェブ利用者の間で，交換, 読み取り, 検索できるようにする。
<span lang="en">
At the core of the model is the Universal Character Set (UCS), defined jointly by the Unicode Standard [Unicode] and ISO/IEC 10646 [ISO/IEC 10646]. In this document, Unicode is used as a synonym for the Universal Character Set. The model will allow Web documents authored in the world's scripts (and on different platforms) to be exchanged, read, and searched by Web users around the world.
</span></p>

	</section>
	<section id="sec-Background">

<h3 title="Background">1.2. 背景</h3>

<p>
この節では、この仕様で取り挙げられる論題についての歴史的な背景をいくつか述べる。
<span lang="en">
This section provides some historical background on the topics addressed in this specification.
</span></p>

<p>
HTML の国際化 —
<cite>Internationalization of the Hypertext Markup Language</cite>
<a href="#rfc2070">[RFC 2070]</a>
から始まり、ウェブのための文字モデルの必要性が，ウェブのコミュニティから認識されるようになった。
このモデルを築き上げる最初の段階では、 HTML 文書のための文字集合として Unicode を採用する所から取り掛かられた。
<span lang="en">
Starting with Internationalization of the Hypertext Markup Language [RFC 2070], the Web community has recognized the need for a character model for the World Wide Web. The first step towards building this model was the adoption of Unicode as the document character set for HTML.
</span></p>

<p>
Unicode が選定されたのは、次を備えていたからである：
<span lang="en">
The choice of Unicode was motivated by the fact that Unicode:
</span></p>

<ul>
	<li>
世界共通の文字レパートリとして，唯一利用できるものであったこと,
<span lang="en">
is the only universal character repertoire available,
</span></li>
	<li>
文字を参照する方法に，テキストの符号化法から独立なものを供していること,
<span lang="en">
provides a way of referencing characters independent of the encoding of the text,
</span></li>
	<li>
綿密に更新／補完されてきたこと,
<span lang="en">
is being updated/completed carefully,
</span></li>
	<li>
広く受け入れられ，産業界で実装されてきたこと
<span lang="en">
is widely accepted and implemented by industry.
</span></li>
</ul>

<hr>

<p>
W3C は
<a href="#html40">[HTML 4.0]</a>
において， HTML 文書のための文字集合に Unicode を採用した。
後に，同じアプローチが
XML 1.0 <a href="#xml10">[XML 1.0]</a>
や CSS2 <a href="#css2">[CSS2]</a>
などの仕様にも踏襲された。
W3C 仕様とアプリケーションは今や、共通の基準となる文字集合に，Unicode を利用している。
<span lang="en">
W3C adopted Unicode as the document character set for HTML in [HTML 4.0]. The same approach was later used for specifications such as XML 1.0 [XML 1.0] and CSS2 [CSS2]. W3C specifications and applications now use Unicode as the common reference character set.
</span></p>

<p>
ウェブ上でのデータ転送が，まだ（サーバからブラウザへの）単方向が大部分を占めていた，主な目的が文書の描画であった頃は、追加の詳細を指定しないままの Unicode の利用でも足りていた。
しかしながら、ウェブは成長した：
<span lang="en">
When data transfer on the Web remained mostly unidirectional (from server to browser), and where the main purpose was to render documents, the use of Unicode without specifying additional details was sufficient. However, the Web has grown:
</span></p>

<ul>
 	<li>
サーバ, プロキシ, クライアント間で、どの方向にも互いにデータを転送し合う利用が増加している。
<span lang="en">
Data transfers among servers, proxies, and clients, in all directions, have increased.
</span></li>
	<li>
US-ASCII
<a href="#iso646">[ISO/IEC 646]</a><a href="#MIME-charset">[MIME-charset]</a>
レパートリに納まらない文字が利用される割合が更に高まっている。
<span lang="en">
Characters outside the US-ASCII [ISO/IEC 646][MIME-charset] repertoire are being used in more and more places.
</span></li>
	<li>
異なる プロトコル／データ形式 の要素（要素／属性の名前, URI の各種成分, テキスト内容など）によるデータ転送が増加している。
<span lang="en">
Data transfers between different protocol/format elements (such as element/attribute names, URI components, and textual content) have increased.
</span></li>
	<li>
プロトコルやデータ形式のみならず，更に多数の API が定められてきている。
<span lang="en">
More and more APIs are defined, not just protocols and formats.
</span></li>
</ul>

<hr>


<p>
要約すれば、ウェブは 互いに独立な小さなアプリケーションの集合体ではなく，一つのとても巨大なアプリケーションのように見えてきている（
<a href="#Nicol">[Nicol]</a>
参照）。
<span lang="en">
In short, the Web may be seen as a single, very large application (see [Nicol]), rather than as a collection of small independent applications.
</span></p>

<p>
これらの開発により、 Unicode がウェブのための文字モデルの基礎になるための要求が強まる一方で，
Unicode のウェブへの応用について 追加の仕様を作成する必要も生じてきている。
ウェブのために Unicode に要する追加の仕様には、次のものが挙げられる：
<span lang="en">
While these developments strengthen the requirement that Unicode be the basis of a character model for the Web, they also create the need for additional specifications on the application of Unicode to the Web. Some aspects of Unicode that require additional specification for the Web include:
</span></p>

<ul>
 	<li>
Unicode 符号化形式（ UTF-8 ／ UTF-16 ／ UTF-32 ）の選定。
<span lang="en">
Choice of Unicode encoding forms (UTF-8, UTF-16, UTF-32).
</span></li>
	<li>
可変長の文字符号化法や, 結合文字の下での、文字数や, 文字列の長さの算出。
<span lang="en">
Counting characters, measuring string length in the presence of variable-length character encodings and combining characters.
</span></li>
	<li>
文字の多重的な符号化法（例えば 合成済か分解済か）。
<span lang="en">
Duplicate encodings of characters (e.g. precomposed vs decomposed).
</span></li>
	<li>
種々の目的のための制御文字の利用（例えば 双方向テキストの制御, 対称交換, 等々）。
<span lang="en">
Use of control codes for various purposes (e.g. bidirectionality control, symmetric swapping, etc.).
</span></li>
</ul>
<!-- symmetric swapping
http://en.wiktionary.org/wiki/Appendix:Control_characters
Enables or disables the mirroring of specific glyphs, such as the parentheses, in a bidirectional context. Deprecated.
 -->
<hr>

<p id="def-legacyEnc">
この種の側面は，種々の符号化法にも存在し、多くの場合，何らかの方法で<!--  --> あるいは別の形で Unicode に受け継がれていることも、挙げておくべきであろう。
<span lang="en">
It should be noted that such aspects also exist in various encodings, and in many cases have been inherited by Unicode in one way or another from these encodings.
</span></p>

<p>
この仕様の残りの部分では、（ W3C, ISO, IETF による）以前の成果も踏まえ、ウェブのための文字モデルが相互運用性を確保するための追加の要件を示す。
<span lang="en">
The remainder of this specification presents additional requirements to ensure an interoperable character model for the Web, taking into account earlier work (from W3C, ISO and IETF).
</span></p>

<p>
Unicode 標準
<a href="#unicode">[Unicode]</a>
の最初の少数の章に，とても有用な背景 情報が見られる。
Internet 上の文字集合の利用のために IETF により採択された施策が
<a href="#rfc2277">[RFC 2277]</a>
にて文書化されている。
<span lang="en">
The first few chapters of the Unicode Standard [Unicode] provide very useful background reading. The policies adopted by the IETF for on the use of character sets on the Internet are documented in [RFC 2277].
</span></p>

	</section>
	<section id="sec-Notation">

<h3 title="Terminology and Notation">1.3. 用語と記法</h3>

<p>
Unicode 符号位置は <code>U+hhhh</code> のように記される。
ここで “<code>hhhh</code>” は、4 〜 6 個の 16 進数字の並びである。
<span lang="en">
Unicode code points are denoted as U+hhhh, where "hhhh" is a sequence of at least four, and at most six hexadecimal digits.
</span></p>

<p>
例の中では、読者によるカット＆ペーストが行えるようにテキストが用いられている。
読者の環境に適切なフォントが備わっていないと，これらに利用される文字が 意図された通りに現れないので、そのような場合でも理解し得るように配慮されている。
一部の例では，例示の結果が見えることが重要になるので、画像が利用されている。
それらの画像には、それぞれの例に対応するテキストへのリンク（
付録 C：<a href="#sec-ExampleText"><b>テキストの例</b></a>
の中の例示へのリンク）も張られている。
<span class="trans-note">【
この訳ではすべての例示にテキストも添えているので、前述の節は余分なものになっている。
】</span>
<span lang="en">
Text has been used for examples to allow them to be cut and pasted by the reader. Characters used will not appear as intended unless you have the appropriate font, but care has been taken to annotate the examples so that they remain understandable even if you do not. In some cases it is important to see the result of an example, so images have been used; by clicking on the image it is possible to link to the text for these examples in C Example text.
</span></p>

<p class="trans-note">【
用語の対訳について：
Unicode 関連のものは
<a href="https://www.unicode.org/terminology/term_en_ja.html" >Unicode Terminology English - Japanese</a>
から採用している。
】</p>

	</section>
</section>
<section id="sec-Conformance">

<h2 title="Conformance">2. 適合性</h2>

<p>
この節では、仕様, ソフトウェア, ウェブ内容が，この仕様に適合するために満たす必要がある条件が，
この仕様の中でどのように記されるか
について述べる。
<span lang="en">
This section explains the conditions that specifications, software, and Web content have to fulfill to be able to claim conformance to this specification.
</span></p>

<p>
この文書の中で利用される言い回し：
“〜しなければ(〜しては)<em class="rfc2119">ならない</em>”（ = MUST (NOT) ）,
<!-- 
“<em class="rfc2119">要求される</em>”= REQUIRED,
 -->
“〜する<em class="rfc2119">べき</em>(でない)” （ = SHOULD (NOT) ）,
“<em class="rfc2119" title="recommended">推奨される</em>” （ = RECOMMENDED ）,
“〜しても<em class="rfc2119">よい</em>” （ = MAY ）,
<!-- 
「<em class="rfc2119" title="optional">任意選択</em> 」 = “OPTIONAL”,
 -->
は、
RFC 2119
<a href="#rfc2119">[RFC 2119]</a>
に則って解釈されるものとする。
<span lang="en">
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY" and "OPTIONAL" in this document are to be interpreted as described in RFC 2119 [RFC 2119].
</span></p>

<p class="note"><b>注記：</b>
RFC 2119 には、“〜する<em class="rfc2119">べき</em>” として記された要件が、任意選択ではなく，特別な事由が無い限り従うもの，と解釈されなければならないと、明確に記されている：
<q cite="#rfc2119" >
この語, あるいは、形容詞としての “推奨される（ RECOMMENDED ）” は、ある特別な状況においては，特定の事項を無視する妥当な理由が存在し得るが、別の道を選ぶ前に，それが及ぼし得る影響すべてが理解された上で 注意深く検討されなければならないことを意味する
</q>
<span lang="en">
NOTE: RFC 2119 makes it clear that requirements that use SHOULD are not optional and must be complied with unless there are specific reasons not to: "This word, or the adjective "RECOMMENDED", mean that there may exist valid reasons in particular circumstances to ignore a particular item, but the full implications must be understood and carefully weighed before choosing a different course."
</span></p>

<p>
この仕様は、仕様, ソフトウェア, 並びにウェブ内容に対する適合性基準を規定する。
すべての適合性基準には、この文書における適合性基準の対象の検索が容易になるように，次の３種のマーカ：
<i>[仕様]</i> （仕様に適用される判定基準）,
<i>[実装]</i> （ソフトウェア実装に適用される判定基準）,
<i>[内容]</i> （ウェブ内容に適用される判定基準）
が前置される。
<span class="trans-note">【
原文では、それぞれ
"<abbr title="Specification">[S]</abbr>",
"<abbr title="Implementation">[I]</abbr>",
"<abbr title="Content">[C]</abbr>"
で表記
】</span>
<span lang="en">
This specification defines conformance criteria for specifications, for software, and for Web content. To aid the reader, all conformance criteria are preceded by '[X]' where 'X' is one of 'S' for specifications, 'I' for software implementations, and 'C' for Web content. These markers indicate the relevance of the conformance criteria and allow the reader to quickly locate relevant conformance criteria by searching through this document.
</span></p>

<p>
仕様は、次のすべてを満たすとき，この文書に適合する：
<span lang="en">
A specification conforms to this document if it:
</span></p>

<ol>
		<li>
<i>[仕様]</i> が前置されている適合性基準に違反していない。
<span lang="en">
does not violate any conformance criteria preceded by [S],
</span></li>
	<li>
“〜する<em class="rfc2119">べき</em>”,
“<em class="rfc2119">推奨される</em>”
と記された判定基準から逸脱する所では、それに値する事由が文書化されている。
<span lang="en">
documents the reason for any deviation from criteria where the imperative is SHOULD, SHOULD NOT, or RECOMMENDED,
</span></li>
	<li>
適用し得る所では、その仕様に適合する実装に対し，この文書にも適合することを要求している。
<span lang="en">
where applicable, requires implementations conforming to the specification to conform to this document,
</span></li>
	<li>
適用し得る所では、その仕様に適合する内容に対し，この文書にも適合することを要求している。
<span lang="en">
where applicable, requires content conforming to the specification to conform to this document.
</span></li>
</ol>

<p>
実装（ソフトウェア）は、
<i>[実装]</i> が前置されている適合性基準に違反しないとき，この文書に適合する。
<span lang="en">
An implementation (software) conforms to this document if it does not violate any conformance criteria preceded by [I].
</span></p>

<p>
内容は、
<i>[内容]</i> が前置されている適合性基準に違反しないとき，この文書に適合する。
<span lang="en">
Content conforms to this document if it does not violate any conformance criteria preceded by [C].
</span></p>

<p class="note"><b>注記：</b>
仕様に課される要件は、それらの仕様に適合する実装や内容に対し，間接的に要件を課し得る。
同様に、内容に課される要件は、その種の内容を生成するように設計された実装に影響し得る，等々。
<span lang="en">
NOTE: Requirements placed on specifications might indirectly cause requirements to be placed on implementations or content that claim to conform to those specifications. Likewise, requirements placed on content may affect implementations designed to produce such content, and so on.
</span></p>


<p>
この仕様において処理について要件を課す所では、外部に現れるふるまいを指定しているものと，解することとする。
実装は、観測され得るふるまいに違いが無ければ，同じ結果を得る他の方法を利用できる。
<span lang="en">
Where this specification places requirements on processing, it is to be understood as a way to specify the desired external behavior. Implementations can use other means of achieving the same results, as long as observable behavior is not affected.
</span></p>

</section>
<section id="sec-Perceptions">

<h2 title="Perceptions of Characters">3. 文字の知覚</h2>

	<section id="sec-PerceptionsIntro">

<h3 title="Introduction">3.1. 序論</h3>

<p>
Unicode 標準
<a href="#unicode40">[Unicode  4.0]</a>
の用語集の項目には次のように記されている：
<span lang="en">
The glossary entry in the Unicode Standard [Unicode 4.0] gives:
</span></p>

<p>
<q>
文字 —
(1) 言語が記されるときの，意味を持つ最小の成分であって、抽象的な，意味／形を参照するもの...</q>
<span lang="en">
"Character. (1) The smallest component of written language that has semantic value; refers to the abstract meaning and/or shape ..."
</span></p>

<p>
語
“<span class="qterm">文字</span>”
は、多くの文脈下で，異なる意味を伴って利用されている。
様々な人間の文化で、文字の概念を始め，根本的に異なる表記体系が利用されている。
そのような幅広い多様性の下では、末端利用者は，誤解を経験したり, しばしば誤解につながる結果も産み出すことがある。
この多様性は、不完全な技術に起因するものと誤認されがちであるが、人間の思考の多大な柔軟性と創造性，および 人々の文化継承の中で重要な地位を占める筆記の長い伝統に、由来するものである。
Latin, Cyrillic, Greek などの用字系に利用されているアルファベットは、そういった可能性の中の一つに過ぎない。
<span lang="en">
The word 'character' is used in many contexts, with different meanings. Human cultures have radically differing writing systems, leading to radically differing concepts of a character. Such wide variation in end user experience can, and often does, result in misunderstanding. This variation is sometimes mistakenly seen as the consequence of imperfect technology. Instead, it derives from the great flexibility and creativity of the human mind and the long tradition of writing as an important part of the human cultural heritage. The alphabetic approach used by scripts such as Latin, Cyrillic and Greek is only one of several possibilities.
</span></p>

<p class="example"><b>例：</b>
日本語の 平仮名／片仮名 用字系の文字は、音節（通常は子音と母音の組み合わせ）に対応する。
<span lang="en">
EXAMPLE: A character in Japanese hiragana and katakana scripts corresponds to a syllable (usually a combination of consonant plus vowel).
</span></p>

<p class="example"><b>例：</b>
韓国語のハングルは、言語の個々の声音に対応する記号を正方形の枠内に組み合わせ，そのそれぞれが１個の音節を表現する。
利用者とアプリケーションに依存して、個別の記号, あるいは音節クラスタが文字と見なされ得る。
<span lang="en">
EXAMPLE: Korean Hangul combines symbols for individual sounds of the language into square blocks, each of which represents a syllable. Depending on the user and the application, either the individual symbols or the syllabic clusters can be considered to be characters.
</span></p>

<p class="example"><b>例：</b>
インド語派の用字系は、各々の子音 字が，［
子音と母音をクラスタの中で組み合わせるために，半規則的または不規則な方法を用いて、脱落／置換された
］固有の母音を抱えている。
利用者やアプリケーションに依存して，［
個々の子音や母音, あるいは［
子音や子音-母音
］クラスタ
］が文字として知覚され得る。
<span lang="en">
EXAMPLE: In Indic scripts each consonant letter carries an inherent vowel that is eliminated or replaced using semi-regular or irregular ways to combine consonants and vowels into clusters. Depending on the user and the application, either individual consonants or vowels, or the consonant or consonant-vowel clusters can be perceived as characters.
</span></p>

<p class="example"><b>例：</b>
アラビア語とヘブライ語では、母音 声音は概して，全く書き記されない。
それらが書き記されるときは、子音的な字の上／下に置かれる結合マーク
<span class="trans-note">【
<a href="https://ja.wikipedia.org/wiki/%E3%82%B7%E3%83%A3%E3%82%AF%E3%83%AB" >シャクル</a>,
<a href="https://ja.wikipedia.org/wiki/%E3%83%8B%E3%82%AF%E3%83%BC%E3%83%89">ニクダー</a>
】</span>
の利用により指示される。
<span lang="en">
EXAMPLE: In Arabic and Hebrew vowel sounds are typically not written at all. When they are written they are indicated by the use of combining marks placed above and below the consonantal letters.
</span></p>

<p>
仕様の開発者／それらの仕様に基づくソフトウェアの開発者は、彼らが経験してきた 語 “<span class="qterm">文字</span>” の用法に，より馴染んでいて、国際的文脈の下での多様な用法には疎くなりがちである。
更に、コンピュータ利用の文脈では、文字はしばしば関連の概念と混同され，不十分あるいは不適切な仕様やソフトウェアが作り上げられることがある。
<span lang="en">
The developers of specifications, and the developers of software based on those specifications, are likely to be more familiar with usages of the term 'character' they have experienced and less familiar with the wide variety of usages in an international context. Furthermore, within a computing context, characters are often confused with related concepts, resulting in incomplete or inappropriate specifications and software.
</span></p>

<p>
この節では、これらのうちいくつかの 文脈, 意味, 混同 について考察する。
<span lang="en">
This section examines some of these contexts, meanings and confusions.
</span></p>

	</section>
	<section id="sec-WritingSystem">

<h3 title="Units of aural rendering">3.2. 聴覚レンダリングの単位</h3>

<p>
ある種の用字系では、文字は音素と深く結びついている。
（
<dfn id="phoneme">音素</dfn>
とは、特定の音声言語の文脈の下で区別し得る最小の声音である）。
一方，他のものでは、文字は意味と深く結びついている。
文字が（大まかに）音素に対応する場合でも、この関係は単純でないことが多く，文字と音素の関係が一対一になることは稀である。
<span lang="en">
In some scripts, characters have a close relationship to phonemes (a phoneme is a minimally distinct sound in the context of a particular spoken language), while in others they are closely related to meanings. Even when characters (loosely) correspond to phonemes, this relationship may not be simple, and there is rarely a one-to-one correspondence between character and phoneme.
</span></p>

<p class="example"><b>例：</b>
英文の
“<samp>They were too close to the door to close it.</samp>”
では、 /s/, /z/ いずれの音素の表現にも同じ文字
<span class="qchar">s</span>
が利用される。
<span lang="en">
EXAMPLE: In the English sentence, "They were too close to the door to close it." the same character 's' is used to represent both /s/ and /z/ phonemes.
</span></p>

<p class="example"><b>例：</b>
英語の
“<samp>cool</samp>”
の音素 /k/ は
“<samp>keel</samp>”
の音素 /k/ と似ている。
<span lang="en">
EXAMPLE: In the English language the phoneme /k/ of "cool" is like the phoneme /k/ of "keel".
</span></p>

<p class="example"><b>例：</b>
多くの用字系で、例えば日本語の平仮名の音節文字のように，１個の文字が一連の音素の並びを表現し得る。
<span lang="en">
EXAMPLE: In many scripts a single character may represent a sequence of phonemes, such as the syllabic characters of Japanese hiragana.
</span></p>

<p class="example"><b>例：</b>
多くの表記体系は、例えば
“<samp>thing</samp>”
の
<span class="qchar">th</span>
や
<span class="qchar">ng</span>
のように、複数個の文字の並びが１個の音素を表現し得る。
<span lang="en">
EXAMPLE: In many writing systems a sequence of characters may represent a single phoneme, for example 'th' and 'ng' in "thing".
</span></p>

<div class="req"><h5 id="C001">C001<i>[仕様]</i><i>[実装]</i><i>[内容]</i></h5>
<p>
仕様, ソフトウェア, 内容は、言語の 文字と声音の一対一の対応関係に依存したり，それを要求しては<em class="rfc2119">ならない</em>。
<span lang="en">
C001 [S] [I] [C] Specifications, software and content MUST NOT require or depend on a one-to-one correspondence between characters and the sounds of a language.
</span></p></div>

	</section>
	<section id="sec-VisualRenderingUnits">

<h3 title="Units of visual rendering">3.3. 視覚レンダリングの単位</h3>

<p id="def-glyph">
文字の描画（視覚的レンダリング）は、 <em>グリフ</em> の認識概念（ notion ）をもたらす。
ISO/IEC 9541-1
<a href="#iso9541">[ISO/IEC 9541-1]</a>
によれば、
<dfn id="glyph">グリフ</dfn>
とは，
<q>認識し得る抽象的グラフィック記号であって，特定のデザインに依存しないもの</q>
として，定義されている。
文字とグリフとの間には、一対一の対応関係は<em>成り立たない</em>：
<span lang="en">
Visual rendering introduces the notion of a glyph. Glyphs are defined by ISO/IEC 9541-1 [ISO/IEC 9541-1] as "a recognizable abstract graphic symbol which is independent of a specific design". There is not a one-to-one correspondence between characters and glyphs:
</span></p>

<ul>
	<li>
１個の文字は、複数のグリフにより表現され得る（各グリフはその文字の表現の一部になる）。
これらのグリフは，物理的に別々に分離され得る。
<span lang="en">
A single character can be represented by multiple glyphs (each glyph is then part of the representation of that character). These glyphs may be physically separated from one another.
</span></li>
	<li>
１個のグリフは、複数個の文字の並びを表現し得る（例えば合字など — 他にもいくつかある）。
<span lang="en">
A single glyph may represent a sequence of characters (this is the case with ligatures, among others).
</span></li>
	<li>
<p>
文字は文脈に依存して，全く異なるグリフで描画され得る。
<span lang="en">
A character may be rendered with very different glyphs depending on the context.
</span></p>

<p class="trans-note">【
例えば日本語の縦書きと横書きの下での，一部の文字（句読点や長音その他）。
記号に類するもののみならず、この仕様の<a href="#_arabic-example_" >ペルシャ語の例</a>には，文章内の位置に応じて利用されるグリフが変化する例が示されている。
】</p>
	</li>
	<li>
１個のグリフは、異なる文字を表現し得る（例えば Latin 大文字 A （ U+0041 ）, Greek 大文字 Α （ U+0391 ）, Cyrillic 大文字 А （ U+0410 ））。
<span lang="en">
A single glyph may represent different characters (e.g. capital Latin A, capital Greek A and capital Cyrillic A).
</span></li>
</ul>

<hr>

<p>
グリフの集合は
<dfn id="font">フォント</dfn>
を構成する。
文字が，符号化されたテキストを組織化するための基本的な単位をなすのと同じ様に、一連のグリフも，テキストの視覚的描画を組織化するための基本的な単位をなすように構築され得る。
<span lang="en">
A set of glyphs makes up a font. Glyphs can be construed as the basic units of organization of the visual rendering of text, just as characters are the basic unit of organization of encoded text.
</span></p>

<div class="req"><h5 id="C002">C002<i>[仕様]</i><i>[実装]</i><i>[内容]</i></h5>
<p>
仕様, ソフトウェア, 内容は、文字とテキストの表示単位との一対一の対応関係に依存したり，それを要求しては<em class="rfc2119">ならない</em>。
<span lang="en">
C002 [S] [I] [C] Specifications, software and content MUST NOT require or depend on a one-to-one mapping between characters and units of displayed text.
</span></p></div>

<p>
付録 B：<a href="#sec-CharExamples">文字, キーストローク, グリフの例</a>
に、文字からグリフへの対応関係の複雑さを示す例が挙げられている。
<span lang="en">
See the appendix B Examples of Characters, Keystrokes and Glyphs for examples of the complexities of character to glyph mapping.
</span></p>

		<section id="sec-LogicalOrder">

<h4 title="Visual Rendering and Logical Order">3.3.1. 視覚レンダリングと論理順</h4>

<p>
一部の用字系, 特にアラビア語とヘブライ語は、右から左へ書かれる。
これらの用字系の文字が含まれたテキストは、左右双方向に流れ得るので，双方向テキストと呼ばれる。
Unicode 標準
<a href="#unicode">[Unicode]</a>
では、文字が
<dfn id="logical-order">論理順</dfn>
で格納／交換されることが要求される。
すなわち、ほぼ，テキストがキーボードでタイプされた順番, あるいは話された順番に対応する（詳細な定義は
<a href="#unicode40">[Unicode  4.0]</a>,
2.2 節を参照）。
論理順による順序付けは、データの相互運用性を確保するため, および
アクセス性の向上, 検索, 照合のためにも重要になる。
<span lang="en">
Some scripts, in particular Arabic and Hebrew, are written from right to left. Text including characters from these scripts can run in both directions and is therefore called bidirectional text. The Unicode Standard [Unicode] requires that characters be stored and interchanged in logical order, i.e. roughly corresponding to the order in which text is typed in via the keyboard or spoken (for a more detailed definition see [Unicode 4.0], Section 2.2). Logical ordering is important to ensure interoperability of data, and also benefits accessibility, searching, and collation.
</span></p>

<div class="req"><h5 id="C003">C003<i>[仕様]</i><i>[実装]</i><i>[内容]</i></h5>
<p>
プロトコル／データ形式／ API
は、テキストデータを論理順で格納, 交換, 処理しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C003 [S] [I] [C] Protocols, data formats and APIs MUST store, interchange or process text data in logical order.
</span></p></div>

<p>
双方向テキストの下では、テキスト選択には２つのモードがある。
最初に挙げる
<dfn id="logical-selection-mode">論理的選択モード</dfn>
は、利用者のマウスの動きの両端の間に
<em>論理的に</em>
位置するすべての文字を選択する。
下の図にて，利用者が２番目の単語の先頭の字とその次の字の合間から, 年号の２桁目まで選択した場合の，論理的な選択は、図の２段目のようになる：
<span lang="en">
In the presence of bidirectional text, two possible selection modes can be considered. The first is logical selection mode, which selects all the characters logically located between the end-points of the user's mouse gesture. Here the user selects from between the first and second letters of the second word to the middle of the number. Logical selection looks like this:
</span></p>

<table>
<caption>
論理的選択により視覚的範囲が不連続になる
<span class="summary">（双方向テキストの文脈下における，メモリ内の単独の論理的選択と,
その結果得られるスクリーン上の２個の選択範囲を示す，２つの画像）</span>
<span lang="en">
Logical selection resulting in discontiguous visual ranges
(Two images contrasting a single logical selection in memory and the resulting two selections on screen, in a bidi context)
</span></caption>
<tbody>
<tr><th>視覚表示<td>
<a href="#arabic-select"><img
	src="charmod/logSelScreen.gif"
	height="32" width="144"
	alt="同じ例で，選択テキストが強調表示されたときのスクリーン上での表示を示す画像。
２個の分離された文字ブロックが強調表示される。
The same example, showing how the text would look on-screen when highlighted, showing two separate highlighted character ranges."
></a>

<tr><th>論理順<td>
<a href="#arabic-select"><img
	src="charmod/logSelMemory.gif"
	height="27" width="323"
	alt=
"２個のアラビア語の単語に年号が後続する文字列における論理順による文字の並び。
２番目の単語の途中から年号の途中までの範囲に入る文字が選択されたとするとき、論理選択モードでは，強調表示される範囲が１個の連続的な文字並びになる。
An example showing the logical order of characters in a string containing two Arabic words followed by a year number. In logical selection mode, the range of characters selected by starting the selection in the middle of the second word and ending in the middle of the year number is depicted using highlighting. The highlighting covers a single block of contiguous characters." 
></a>

<tr><th class="trans-note">【テキスト】<td>
<samp >&#x0639;&#x062F;&#x062F; &#x0645;&#x0627;&#x0631;&#x0633; &#x0661;&#x0669;&#x0669;&#x0668;</samp>

<!-- 

عدد مارس ١٩٩٨

-->
</tbody></table>

<p>
したがって，双方向テキストの下では、メモリ内における連続的な論理的選択が，
<em>スクリーン上では不連続に見え得る</em>
ことになる。
この不連続性があるため、一部の利用者からは，マウスの動きの両端に挟まれる中の
<em>視覚的に</em>
位置するすべての文字を選択する，
<dfn id="visual-selection-mode">視覚的選択モード</dfn>
も好まれる。
前の例と同じマウスの動きによる結果は、次の図の様になる：
<span lang="en">
It is a consequence of the bidirectionality of the text that a single, continuous logical selection in memory results in a discontinuous selection appearing on the screen. This discontinuity makes some users prefer a visual selection mode, which selects all the characters visually located between the end-points of the user's mouse gesture. With the same mouse gesture as before, we now obtain:
</span></p>

<table>
<caption>
視覚的選択により論理範囲が不連続になる
<span class="summary">（双方向テキストの文脈下における，スクリーン上の単独の視覚的選択とその結果得られる, メモリ内の２個の選択範囲を示す，２つの画像）</span>
<span lang="en">
Visual selection resulting in discontiguous logical ranges
(Two images contrasting a single visual selection on screen and the resulting two selections in memory, in a bidi context)
</span></caption>
<tbody>
<tr><th>視覚表示<td>
<a href="#arabic-select"><img
	src="charmod/visSelScreen.gif"
	height="33" width="141"
	alt="同じ例で，選択テキストが強調表示されたときのスクリーン上での表示を示す画像。単独の連続する文字ブロックが強調表示される。
same example, showing how the text would look on-screen when highlighted, showing
a single highlighted block of contiguous characters."></a>

<tr><th>論理順<td>
<a href="#arabic-select"><img
	src="charmod/visSelMemory.gif"
	height="27" width="343"
	alt=
"２個のアラビア語の単語に年号が後続する文字列における論理順による文字の並び。
２番目の単語の途中から年号の途中までの範囲に入る文字が選択されたとするとき、視覚的選択モードでは，強調表示される範囲が２個の分離された文字ブロックになる。
An example showing the logical order of characters in a string containing two Arabic words followed by a year number. In visual selection mode, the range of characters selected by starting the selection in the middle of the second word and ending in the middle of the year number is depicted using highlighting. The highlighting covers two separate blocks of characters."></a>

<tr><th class="trans-note">【テキスト】<td>
<samp >&#x0639;&#x062F;&#x062F; &#x0645;&#x0627;&#x0631;&#x0633; &#x0661;&#x0669;&#x0669;&#x0668;</samp>

</tbody></table>

<p>
上の例に見られるように，視覚的選択モードにおいては、
１個の視覚的選択範囲から <em>複数個</em> の論理範囲が得られ得るので、プロトコル, API, 実装による適応の必要が生じ得る。
双方向テキストのための利用者インタフェースに関連する，他の側面としては、キャレットの動き, backspace／delete キーのふるまいなどがある。
<span lang="en">
In visual selection mode, as seen in the example above, a single visual selection range may result in two or more logical ranges, which may have to be accommodated by protocols, APIs and implementations. Other, related aspects of a user interface for bidirectional text include caret movement, behavior of backspace/delete keys, and so on.
</span></p>

<p>
現時点では、大部分の実装が論理的選択を提供しており，視覚的選択を提供しているものはごく限られている。
<span lang="en">
Currently, most implementations provide logical selection, while only very few provide visual selection.
</span></p>

<div class="req"><h5 id="C075">C075<i>[実装]</i></h5>
<p>
選択された文字は、実装の一部が論理的選択／視覚的選択いずれを利用するかに関わらず，格納域においては論理順で保持されなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C075 [I] Independent of whether some implementation uses logical selection or visual selection, characters selected MUST be kept in logical order in storage.
</span></p></div>

<div class="req"><h5 id="C004">C004<i>[仕様]</i></h5>
<p>
範囲の選択を孕むプロトコルと API の仕様は、それらのプロトコルや API を通して，少なくとも スクリーン上の視覚的選択の実装をサポートするために必要な程度の，不連続な論理的選択を提供する<em class="rfc2119">べき</em>である。
<span lang="en">
C004 [S] Specifications of protocols and APIs that involve selection of ranges SHOULD provide for discontiguous logical selections, at least to the extent necessary to support implementation of visual selection on screen on top of those protocols and APIs.
</span></p></div>

		</section>
	</section>
	<section id="sec-InputUnits">

<h3 title="Units of input">3.4. 入力の単位</h3>

<p>
キーボード入力においては，キーストロークと入力される文字に一対一の対応関係があるとは<em>限らない</em>。
キーボードのキーの個数は限られている。
一部のキーボードは、１回のキー押下から複数個の文字を生成する。
キーが文字を生成する代わりに，後続のキー押下の結果に影響する場合もある（ “<span class="qterm">デッドキー</span>” ）。
多くの表記体系は，キーボードに納まり切らない多数の文字を備えるので、キーストロークの並びを文字の並びに変換する，より複雑な
<dfn id="input-methods">IME</dfn>
（ input-methods ）に頼らなければならない。
一部の文字の入力には特殊な修飾キーを要する言語もある。
自明でない入力の例については
付録 B：<a href="#sec-CharExamples"><b>文字, キーストローク, グリフの例</b></a>
を見よ。
<span lang="en">
In keyboard input, it is not always the case that keystrokes and input characters correspond one-to-one. A limited number of keys can fit on a keyboard. Some keyboards will generate multiple characters from a single keypress. In other cases ('dead keys') a key will generate no characters, but affect the results of subsequent keypresses. Many writing systems have far too many characters to fit on a keyboard and must rely on more complex input methods, which transform keystroke sequences into character sequences. Other languages may make it necessary to input some characters with special modifier keys. See B Examples of Characters, Keystrokes and Glyphs for examples of non-trivial input.
</span></p>

<div class="req"><h5 id="C005">C005<i>[仕様]</i><i>[実装]</i></h5>
<p>
仕様とソフトウェアは、１回のキーストロークから１個の文字が得られることや,
（修飾キーを伴うものも含め）１個の文字が１回のキーストロークで入力されることや,
存在するどのキーボードも同じであることを，要求したり, それに依存しては<em class="rfc2119">ならない</em>。
<span lang="en">
C005 [S] [I] Specifications and software MUST NOT require nor depend on a single keystroke resulting in a single character, nor that a single character be input with a single keystroke (even with modifiers), nor that keyboards are the same all over the world.
</span></p></div>

	</section>
	<section id="sec-CollationUnits">

<h3 title="Units of collation">3.5. 照合の単位</h3>

<p>
整列や検索の際に利用される文字列の比較は、一般的に，符号化された文字に一対一に対応しない単位に基づいている。
その種の文字列比較では、文字並びが，整列順序において固有の位置付けを持つような１個の
<dfn id="collation-unit">照合単位</dfn>
にまとめられたり,
１個の文字が複数個の照合単位に分離されたり,
文字の種々の側面（文字の大小, ダイアクリティカルマークの有無, 等々）が判別されて別々に整列される（多段階 整列）こともある。
<span lang="en">
String comparison as used in sorting and searching is based on units which do not in general have a one-to-one relationship to encoded characters. Such string comparison can aggregate a character sequence into a single collation unit with its own position in the sorting order, can separate a single character into multiple collation units, and can distinguish various aspects of a character (case, presence of diacritics, etc.) to be sorted separately (multi-level sorting).
</span></p>

<p>
加えて、一定量の前処理を要する場合もある。
また，一部の言語（日本語やアラビア語など）では、整列順序が，発音体系や語源†など，より高次の順序付け因子に支配され得る。
照合の手法はまた，アプリケーションごとに様々になり得る。
<span lang="en">
In addition, a certain amount of pre-processing may also be required, and in some languages (such as Japanese and Arabic) sort order may be governed by higher order factors such as phonetics or word roots. Collation methods may also vary by application.
</span></p>

<p class="example"><b>例：</b>
<span class="trans-note">【
日本語では、漢字がその画数／偏／旁に基づいて整列されたり, 語句がその読み仮名の順で整列される場合もある。
】</span>
</p>

<p class="example"><b>例：</b>
スペイン語の伝統的な整列では、文字並び
<span class="qchar">ch</span>
と
<span class="qchar">ll</span>
が不可分な照合単位として扱われる。
スペイン語の整列や一定範囲の日常利用では
<span class="qchar">ch</span>
が１個の単位と見なされる一方で、現在のデジタル符号化法では２個の文字として扱われ，キーボードも同様になる（利用者は <kbd><kbd>c</kbd></kbd>, <kbd><kbd>h</kbd></kbd> を順にタイプする）。
<span lang="en">
EXAMPLE: In traditional Spanish sorting, the character sequences 'ch' and 'll' are treated as atomic collation units. Although Spanish sorting, and to some extent Spanish everyday use, treat 'ch' as a single unit, current digital encodings treat it as two characters, and keyboards do the same (the user types 'c', then 'h').
</span></p>

<p class="example"><b>例：</b>
一部の言語では、字
<span class="qchar">æ</span>
が，２個の連続する照合単位：
<span class="qchar">a</span>,
<span class="qchar">e</span>
と見なされた上で整列される。
<span lang="en">
EXAMPLE: In some languages, the letter 'æ' is sorted as two consecutive collation units: 'a' and 'e'.
</span></p>

<p class="example"><b>例：</b>
大文字・小文字の区別がある用字系（ “bicameral” 用字系）で書かれたテキストの整列では、通常，最初の処理パスでは文字大小の同一視を要し，後続の処理パスの中で文字の大小が仕分けに利用される。
<span lang="en">
EXAMPLE: The sorting of text written in a bicameral script (i.e. a script which has distinct upper and lower case letters) is usually required to ignore case differences in a first pass; case is then used to break ties in a later pass.
</span></p>

<p class="example"><b>例：</b>
整列におけるアクセントマーク付きの字の扱いは、対象の用字系や言語に依存する。
字
<span class="qchar">ö</span>
は、フランス語においては
<span class="qchar">o</span> 
の修飾形として扱われる一方，スウェーデン語では
<span class="qchar">o</span>
と完全に独立な字として扱われる（加えて，整列順も
<span class="qchar">z</span>
の後になる）。
ドイツ語では、一部のアプリケーションが
<span class="qchar">ö</span>
を
<span class="qchar">oe</span>
の並びであるかのように扱う。
<span lang="en">
EXAMPLE: Treatment of accented letters in sorting is dependent on the script or language in question. The letter 'ö' is treated as a modified 'o' in French, but as a letter completely independent from 'o' (and sorting after 'z') in Swedish. In German certain applications treat the letter 'ö' as if it were the sequence 'oe'.
</span></p>

<p class="example"><b>例：</b>
タイ語では、 
<span class="qchar">ไก</span> (U+0E44 U+0E01)
の並びが
<span class="qchar">กไ</span> (U+0E01 U+0E44)
と記されているかのように整列されなければならない。
再順序付けは，概して前処理の段階で行われる。
<span lang="en">
EXAMPLE: In Thai the sequence 'ไก' (U+0E44 U+0E01) must be sorted as if it were written 'กไ' (U+0E01 U+0E44). Reordering is typically done during an initial pre-processing stage.
</span></p>
<p class="example"><b>例：</b>
ドイツ語の辞書では、一般に，
<span class="qchar">ä</span> ／ 
<span class="qchar">ö</span> ／
<span class="qchar">ü</span>
がそれぞれ
<span class="qchar">a</span> ／
<span class="qchar">o</span> ／
<span class="qchar">u</span>
と一緒にされて整列される。
一方で、ドイツ語の電話帳では、一般に，
<span class="qchar">ä</span> ／
<span class="qchar">ö</span> ／
<span class="qchar">ü</span>
がそれぞれ
<span class="qchar">ae</span> ／
<span class="qchar">oe</span> ／
<span class="qchar">ue</span>
と綴られているかのように整列される。
このように、利用される照合アルゴリズムは，アプリケーションに依存する。
<span lang="en">
EXAMPLE: German dictionaries typically sort 'ä', 'ö' and 'ü' together with 'a', 'o' and 'u' respectively. On the other hand, German telephone books typically sort 'ä', 'ö' and 'ü' as if they were spelled 'ae', 'oe' and 'ue'. Here the application is affecting the collation algorithm used.
</span></p>

<div class="req"><h5 id="C006">C006<i>[仕様]</i><i>[実装]</i></h5>
<p>
利用者のためにテキストを整列あるいは検索するソフトウェアは、関連の言語やアプリケーションに適切な照合単位と順序付け規則に基づいて，それを行う<em class="rfc2119">べき</em>である。
<span lang="en">
C006 [S] [I] Software that sorts or searches text for users SHOULD do so on the basis of appropriate collation units and ordering rules for the relevant language and/or application.
</span></p></div>

<div class="req"><h5 id="C007">C007<i>[仕様]</i><i>[実装]</i></h5>
<p>
検索や整列が動的に行われる所では、とりわけ多言語環境においては， “関連の言語” が現在の利用者のそれになるように（したがって利用者ごとに異なり得るように）決定される<em class="rfc2119">べき</em>である。
<span lang="en">
C007 [S] [I] Where searching or sorting is done dynamically, particularly in a multilingual environment, the 'relevant language' SHOULD be determined to be that of the current user, and may thus differ from user to user.
</span></p></div>

<div class="req"><h5 id="C066">C066<i>[仕様]</i><i>[実装]</i></h5>
<p>
利用者によるテキストの整列や検索が可能なソフトウェアは、照合単位と順序付けのための，別の規則の選択も可能にする<em class="rfc2119">べき</em>である。
<span lang="en">
C066 [S] [I] Software that allows users to sort or search text SHOULD allow the user to select alternative rules for collation units and ordering.
</span></p></div>

<div class="req"><h5 id="C008">C008<i>[仕様]</i><i>[実装]</i></h5>
<p>
整列／検索アルゴリズムの仕様と実装は、テキストが Unicode  のどの文字を含んでいても，適応する<em class="rfc2119">べき</em>である。
<span lang="en">
C008 [S] [I] Specifications and implementations of sorting and searching algorithms SHOULD accommodate text that contains any character in Unicode.
</span></p></div>

<p>
したがって，テキストに規則の対象外の Unicode 文字が含まれている場合でも、最低でも，照合アルゴリズムが正常に機能し続けることが求められることに注意。
これは、すべての用字系に対応できるような複雑なアルゴリズムの完全な実装を要求するものではない。
この要件を満たす有用な方法として、すべての Unicode 文字が対象にされる，既定の照合アルゴリズムの適用が挙げられる。
<span lang="en">
Note that this requires, as a minimum, that a collation algorithm does not break down if the text contains Unicode characters that are not covered by its rules. It does not necessarily require full implementation of complex algorithms for all scripts. One useful way of satisfying the requirement is to apply a default collation algorithm that covers all Unicode characters.
</span></p>

<p>
ISO/IEC 14651
<a href="#iso14651">[ISO/IEC 14651]</a>
および 
Unicode 技術報告 #10, Unicode 照合アルゴリズム
<a href="#UTR10">[UTR #10]</a>
は、大多数の言語に適応する照合のためのモデルを述べ，既定の照合順序を提供している。
それらは照合とその実装の指針を提供する，適切な基準になる。
いかなる文字が含まれようとも，予測可能な文字列の順序付けと比較を確保するために、既定の照合順序を，特定のロケールにあつらえの規則と組み合わせて利用できる。
<span lang="en">
ISO/IEC 14651 [ISO/IEC 14651] and Unicode Technical Report #10, the Unicode Collation Algorithm [UTR #10], describe a model for collation that accommodates most languages and provide a default collation order. They are appropriate references for collation and provide implementation guidelines. The default collation order can be used in conjunction with rules tailored for a particular locale to ensure a predictable ordering and comparison of strings, whatever characters they include.
</span></p>

	</section>
	<section id="sec-Storage">

<h3 title="Units of storage">3.6. 格納の単位</h3>

<p>
コンピュータにおけるデータの格納と通信は、ビットやバイト（オクテットとも呼ばれる 8-bit 単位）などの，情報の物理的な 格納／交換 の単位に依拠する。
仕様や実装にありがちな誤りは、文字を物理的な格納単位と同一視することである。
文字とその種の格納単位との間の対応関係は、実際には極めて複雑であり，
次の 4.1 節：<a href="#sec-Digital"><b>文字符号化法</b></a>
にて論じられる。
<span lang="en">
Computer storage and communication rely on units of physical storage and information interchange, such as bits and bytes (8-bit units, also called octets). A frequent error in specifications and implementations is the equating of characters with units of physical storage. The mapping between characters and such units of storage is actually quite complex, and is discussed in the next section, 4.1 Character Encoding.
</span></p>

<div class="req"><h5 id="C009">C009<i>[仕様]</i><i>[実装]</i></h5>
<p>
仕様, ソフトウェア, 内容は、文字と物理的な格納単位との一対一の対応関係に依存したり，それを要求しては<em class="rfc2119">ならない</em>。
<span lang="en">
C009 [S] [I] Specifications, software and content MUST NOT require or depend on a one-to-one relationship between characters and units of physical storage.
</span></p></div>

	</section>
	<section id="sec-PerceptionsOutro">

<h3 title="Summary">3.7. 要約</h3>

<p id="def-text">
語 文字は，種々の文脈の下で異なる仕方で利用されるので、それらの文脈の外で利用された際に，しばしば混同が生じる。
テキストのデジタル表現の文脈の下では、
<dfn id="character">文字</dfn>
は，テキストの小さな論理単位として定義し得るものになる。
しかる後、
<dfn id="text">テキスト</dfn>
が文字の並びとして定義される。
その種の非公式的な定義は、多くの場合，共通の理解を醸成または獲得するに足るものではあるが、詳細が問題にされ始めるや否や，容易に誤解の源になる。
実際に機能する
仕様／プロトコル実装／末端利用者のためのソフトウェア
を書くためには、これらの誤解が生じ得ることについて理解しておくことがとても重要である。
<span lang="en">
The term character is used differently in a variety of contexts and often leads to confusion when used outside of these contexts. In the context of the digital representations of text, a character can be defined as a small logical unit of text. Text is then defined as sequences of characters. While such an informal definition is sufficient to create or capture a common understanding in many cases, it is also sufficiently open to create misunderstandings as soon as details start to matter. In order to write effective specifications, protocol implementations, and software for end users, it is very important to understand that these misunderstandings can occur.
</span></p>

<p>
この 3 節：<a href="#sec-Perceptions"><b>文字の知覚</b></a>
では、語 “<span class="qterm">文字</span>” とは必ずしも一致しない単位 —
音素, グリフ, 照合単位など
— のための用語について論じた。
次の 4.1 節：<a href="#sec-Digital"><b>文字符号化法</b></a>では、
<dfn id="units-of-encoding">符号化の単位</dfn>
（符号位置, 符号単位, バイト）を精確に定義するために，
“<span class="qterm">文字</span>” に代わって利用されるべき用語について述べる。
<span lang="en">
This section, 3 Perceptions of Characters, has discussed terms for units that do not necessarily overlap with the term 'character', such as phoneme, glyph, and collation unit. The next section, 4.1 Character Encoding, lists terms that should be used rather than 'character' to precisely define units of encoding (code point, code unit, and byte).
</span></p>

<div class="req"><h5 id="C010">C010<i>[仕様]</i></h5>
<p>
語 “<span class="qterm">文字</span>” を利用する仕様は、それが意図する意味を定義しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C010 [S] When specifications use the term 'character' the specifications MUST define which meaning they intend.
</span></p></div>

<div class="req"><h5 id="C067">C067<i>[仕様]</i></h5>
<p>
仕様は、可能なら，一般的な語 “文字” の代わりに，より明確な用語を用いる<em class="rfc2119">べき</em>である。
<span lang="en">
C067 [S] Specifications SHOULD use specific terms, when available, instead of the general term 'character'.
</span></p></div>

	</section>
</section>
<section id="sec-Characters">

<h2 title="Digital Encoding of Characters">4. 文字のデジタル符号化法</h2>

	<section id="sec-Digital">

<h3 title="Character Encoding">4.1. 文字符号化法</h3>

<p>
WWW においては、文字は，コンピュータ利用環境と同様に，どう利用するにしても 符号化されなければならない。
テキストを符号化するために、多種多様な文字符号化法が考案されている。
文字符号化法とは、概ね，利用者が取り扱う文字の並びとコンピュータが取り扱うビットの並びとの間の対応関係として説明される。
<span lang="en">
On the WWW, as in any computing environment, characters must be encoded to be of any use. To achieve text encoding, a large variety of character encodings have been devised. Character encodings can loosely be explained as mappings between the character sequences that users manipulate and the sequences of bits that computers manipulate.
</span></p>

<p>
与えられたテキスト符号化法の複雑さと，コンピュータの時代を通して考案されてきた 文字符号化法のための多種多様な仕組み<!--1-->の下では、より公式的な符号化処理の記述が有用になる。
テキストの符号化処理を定義する過程は、次のように述べられる（より詳細な記述は “Unicode 技術報告 #17: 文字符号化モデル”
<a href="#UTR17">[UTR #17]</a>
を参照）：
<span lang="en">
Given the complexity of text encoding and the large variety of mechanisms for character encoding invented throughout the computer age, a more formal description of the encoding process is useful. The process of defining a text encoding can be described as follows (see Unicode Technical Report #17: Character Encoding Model [UTR #17] for a more detailed description):
</span></p>

<ol>
	<li id="def-repertoire">
<p>
まず、符号化の対象になる文字<span class="trans-note">【抽象文字】</span>の集合が選定<!--0-->される。
対象の文字は、１つ以上の対象言語の下で，テキストを表記し, 種々のテキスト処理が効率的に行えるように、実用的に選ばれる。
それらは、利用者が字その他の文字として知覚しているものとは，精確に対応していないかもしれない。
この文字の集合は
<dfn id="repertoire">レパートリ</dfn>
と呼ばれる。
<span lang="en">
A set of characters to be encoded is identified. The characters are pragmatically chosen to express text and to efficiently allow various text processes in one or more target languages. They may not correspond precisely to what users perceive as letters and other characters. The set of characters is called a repertoire.
</span></p>

<p class="trans-note">【
レパートリは、通例は固定的（ “closed” ）にされるが，一般的には拡張も可能（ “open” ）にされ得る。
】</p>

	</li>
	<li id="def-CCS">
<p>
次に、レパートリ内の各文字は，
<dfn id="code-point">符号位置</dfn>
と呼ばれる （数学的, 抽象的な）非負整数に結び付けられる。
その結果、
<dfn id="coded-character-set">符号化文字集合</dfn>
（ <abbr title="Coded Character Set">CCS</abbr> ）と呼ばれる，レパートリから非負整数の集合への対応関係が得られる。
（符号位置（ code point ）は、
<dfn id="character-number">character-number</dfn>
あるいは
<dfn id="code-position">code-position</dfn>
と呼ばれることもある。）
<span lang="en">
Each character in the repertoire is then associated with a (mathematical, abstract) non-negative integer, the code point (also known as a character number or code position). The result, a mapping from the repertoire to the set of non-negative integers, is called a coded character set (CCS).
</span></p>

<p class="trans-note">【
逐語訳的には，“code point” ／ “code position” の対訳はそれぞれ “符号点” ／ “符号位置<!--0-->” になる所であるが、Unicode の公式の対訳表に倣い， “code point” の対訳には “符号位置<!--0-->” を採用している。
】</p>

	</li>
	<li id="def-CEF">
コンピュータ上での利用に適するような，
<span class="trans-note">【固定的な】</span>
基本データ型（バイトや 16-bit などの格納単位）が選定<!--0-->された上で、
<dfn id="character-encoding-form">文字符号化形式</dfn>
（ <abbr title="Character Encoding Form">CEF</abbr> ）と呼ばれる，［
符号化文字集合（
<abbr title="Coded Character Set">CCS</abbr>
）の抽象的整数を，［
<dfn id="code-units">符号単位</dfn>
と呼ばれる，基本データ型による値
］の並びへ符号化するための写像
］が利用される。
文字符号化形式は、ごく単純なもの（例えば， <abbr>CCS</abbr> の整数をコンピュータ プラットフォームで選定されたデータ型による，整数の自然な表現に符号化するもの）から，いくらでも複雑なもの（例えば，１個の抽象的整数を符号化した結果が、可変個の符号単位からなり，そのそれぞれの値がその整数の自明でない関数で与えられるもの）にもなり得る。
<span lang="en">
To enable use in computers, a suitable base datatype is identified (such as a byte, a 16-bit unit of storage or other) and a character encoding form (CEF) is used, which encodes the abstract integers of a coded character set (CCS) into sequences of the code units of the base datatype. The character encoding form can be extremely simple (for instance, one which encodes the integers of the CCS into the natural representation of integers of the chosen datatype of the computing platform) or arbitrarily complex (a variable number of code units, where the value of each unit is a non-trivial function of the encoded integer).
</span></li>
	<li id="def-CES">
<p>
最後に，バイト単位の伝達／格納のために、
<dfn id="character-encoding-scheme">文字符号化スキーム</dfn>
（ <abbr title="Character Encoding Scheme">CES</abbr>,
<dfn id="serialization-scheme">直列化スキーム</dfn>
とも呼ばれる）が利用される。
文字符号化スキームとは、文字符号化形式（
<abbr title="Character Encoding Form">CEF</abbr>
）の符号単位から well-defined なバイト列への対応関係であり、データ型が複数バイトに基づく場合に必要になるバイト順の指定や，一部の場合には，複数の文字符号化スキーム（例えば ISO 2022 ）の下での符号単位ごとのスキームの切替も織り込まれる。
</p>
<p class="trans-note">【
“well-defined” — この語の解釈<!--0-->は注意を要する：
例えば対応関係が一対多で結果が一意的に定まらないものであっても，常に，元データを一意的に復元可能ならば、 well-defined と見なされ得るかもしれない。
逆に、個々の対応関係が一対一であっても，全体として元データを一意的に復元できないケースはあり得るので，その種のものは well-defined とは見なされないであろう（レパートリの拡張が許容されている場合は、拡張された際の well-defined の維持も考慮されるべきであろう）。
】</p>

<p>
文字符号化スキームと, それに伴って利用される符号化文字集合の組は，
<dfn id="character-encoding">文字符号化法</dfn>
（ “character encoding” ）と呼ばれ、
<abbr title="Internet Assigned Numbers Authority">IANA</abbr>
charset 識別子などの，一意的な識別子により識別される。
テキストを表現するバイト列と
<code class="keyword">charset</code>
識別子により識別される文字符号化法が与えられれば、原理的には，テキストの文字並びを一義的に復元できるようになる。
</p>

<p class="trans-note">【
“文字符号化法” — 訳語としては CES の同義語として用いられることが多いが，この訳ではこの対訳を用いる†。
上に述べられているように， CCS, CEF, CES をひっくるめた概念と捉えればよいであろう（文脈によっては “文字符号化処理” 等々）。
CCS と CEF が１つに固定された文脈下では（例えば現今のウェブプラットフォームの大部分は（その内部処理モデルが） Unicode と UTF-16 （ 16-bit 符号単位）に基づいている）， CES と同義になる。
<small >（ † — 実際の所、一般には “（文字<!--0-->）エンコーディング”  以外の対訳は見当たらない。
他の “encoding …” については、“符号化<!--0-->…” の対訳があるにも関わらず。）</small>
<!--
語義的には “-スキーム” （ “枠組み”, “体系” ）の方が相応しい？
“文字符号化体系”
-->
】</p>

<p lang="en">
To enable transmission or storage using byte-oriented devices, a serialization scheme or character encoding scheme (CES) is next used. A character encoding scheme is a mapping of the code units of a character encoding form (CEF) into well-defined sequences of bytes, taking into account the necessary specification of byte-order for multi-byte base datatypes and including in some cases switching schemes between the code units of multiple character encoding schemes (an example is ISO 2022). A character encoding scheme, together with the coded character sets it is used with, is called a character encoding, and is identified by a unique identifier, such as an IANA charset identifier. Given a sequence of bytes representing text and a character encoding identified by a charset identifier, one can in principle unambiguously recover the sequence of characters of the text.
</p>

	</li>
</ol>

<hr>

<p class="note"><b>注記：</b>
用語
“<span class="qterm">charset</span>”
についての論, および文字符号化法についての更なる詳細は、
4.4.2 節：<a href="#sec-EncodingIdent"><b>文字符号化法の識別</b></a>
を見よ。
<span lang="en">
NOTE: See 4.4.2 Character encoding identification for a discussion of the term 'charset' and further details on character encodings.
</span></p>

<p class="note"><b>注記：</b>
語
“<span class="qterm">encoding</span>”
（符号化法<!-- ／符号化処理 -->）は、文字を符号化する実際の処理を指すときや，その処理を遂行するための特定の方法を指すときにも利用されることがあり，いくぶんの曖昧さを孕む（例えば “このファイルは X encoding である”
<span class="trans-note">【
“このファイルは， X という名称から識別される符号化法に規定されている符号化の方法により符号化処理が施された結果を内容とする”
】</span>）。
これらの用法の区別は、その違いが念頭にありさえすれば，通常は文脈から推定し得るものになる。
<span lang="en">
NOTE: The term 'character encoding' is somewhat ambiguous, as it is sometimes used to describe the actual process of encoding characters and sometimes to denote a particular way to perform that process (as in "this file is in the X character encoding"). Context normally allows the distinction of those uses, once one is aware of the ambiguity.
</span></p>

<p class="note"><b>注記：</b>
与えられた
文字の並び, および
“<span class="qterm">文字符号化法</span>”
から，常に同じ並びのバイトが生成されるとは限らない。
特に， ISO 2022 に基づく符号化法では、符号化処理の過程でいくつかの選択肢をとり得る。
<span lang="en">
NOTE: Given a sequence of characters, a given 'character encoding' may not always produce the same sequence of bytes. In particular for encodings based on ISO 2022, there may be choices available during the encoding process.
</span></p>

<p>
最も単純な場合では、例えば US-ASCII
<a href="#iso646">[ISO/IEC 646]</a>
や ISO-8859-1 のように，符号化処理 全体が文字からバイトへの自明な一対一の対応関係として、一段で済ませられる。
<span lang="en">
In very simple cases, the whole encoding process can be collapsed to a single step, a trivial one-to-one mapping from characters to bytes; this is the case, for instance, for US-ASCII [ISO/IEC 646] and ISO-8859-1.
</span></p>

<p id="Unicode_Encoding_Form">
テキストは、
UTF-8 ／ UTF-16 ／ UTF-32
のいずれかで符号化されているとき，
<dfn id="Unicode-encoding">Unicode 符号化形式</dfn>
にあるという。
<span lang="en">
Text is said to be in a Unicode encoding form if it is encoded in UTF-8, UTF-16 or UTF-32.
</span></p>

	</section>
	<section id="sec-Transcoding">

<h3 title="Transcoding">4.2. 符号変換</h3>

<p id="def-transcoding">
テキストをある
<a href="#def-CES">文字符号化法</a>
<span class="trans-note">【で符号化された状態】</span>
から別のそれへ変換する処理は
<dfn id="transcoding">符号変換</dfn>
と呼ばれる。
符号変換器は、テキストを構文解析せずに、文字符号化法のレベルでのみ働く。
従って、数値文字参照（
4.6 節：<a href="#sec-Escaping"><b>文字エスケープ法</b></a>
を見よ）などの
<a href="#sec-Escaping">文字エスケープ</a>
を扱うこともなく，埋め込まれている文字符号化法の情報（例えば XML 宣言や HTML の <code>meta</code> 要素の中の）にも対応しない。
<span lang="en">
Transcoding is the process of converting text from one character encoding to another. Transcoders work only at the level of character encoding and do not parse the text; consequently, they do not deal with character escapes such as numeric character references (see 4.6 Character Escaping) and do not adjust embedded character encoding information (for instance in an XML declaration or in an HTML meta element).
</span></p>

<p class="note"><b>注記：</b>
符号変換は
一対一, 多対一, 一対多, 多対多
いずれの対応関係も孕み得る。
加えて，文字の格納順序も符号化法の間で変わり得る。
Unicode 符号化形式のような一部のものは，論理順を規定する一方、視覚順を利用するものもある。
符号化法には、ダイアクリティカルマークを基底文字の前に置くよう規定されているものもあれば，後に置くようにするものもある。
これらの文字の並べ方の相違があるため、符号変換は，<!--  XYZ から yxz のように  -->順序の入れ替わりも孕む：
<span lang="en">
NOTE: Transcoding may involve one-to-one, many-to-one, one-to-many or many-to-many mappings. In addition, the storage order of characters varies between encodings: some, such as the Unicode encoding forms, prescribe logical ordering, while others use visual ordering; among encodings that have separate diacritics, some prescribe that they be placed before the base character, some after. Because of these differences in sequencing characters, transcoding may involve reordering: thus XYZ may map to yxz.
</span></p>

<div class="example"><p><b>例：</b>
最初の例は “ロシア語” を意味するロシア語の単語
“<samp>Русский</samp>”
を Unicode の UTF-16 符号化法から ISO 8859-5 符号化法へ符号変換した場合を示している：
<span lang="en">
EXAMPLE: This first example shows the transcoding of the Russian word 'Русский' meaning 'Russian' (language), from the UTF-16 encoding of Unicode to the ISO 8859-5 encoding:
</span></p>

<table>
<caption>
ISO 8859-5 から UTF-16 への対応関係
<span lang="en">
table displaying the mapping from ISO 8859-5 to UTF-16
</span></caption>
<tbody>
<tr><th colspan="2">UTF-16<th colspan="2">ISO 8859-5
<tr><th>符号単位<th>（短縮）文字名<th>符号単位<th>（短縮）文字名<tr><td>0420<td>CAPITAL ER<td>C0<td>CAPITAL ER<tr><td>0443<td>SMALL U<td>E3<td>SMALL U<tr><td>0441<td>SMALL ES<td>E1<td>SMALL ES<tr><td>0441<td>SMALL ES<td>E1<td>SMALL ES<tr><td>043A<td>SMALL KA<td>DA<td>SMALL KA<tr><td>0438<td>SMALL I<td>D8<td>SMALL I<tr><td>0439<td>SMALL SHORT I<td>D9<td>SMALL SHORT I</tbody></table>

</div>

<div class="example"><p><b>例：</b>
次の例はずっと複雑で、 “平和” を意味するアラビア語の単語
“<samp>&#x0627;&#x0644;&#x0633;&#x0644;&#x0627;&#x0645;</samp>”
<!--

السلام

-->
の， IBM CP864 符号化法により視覚的に順序付けられ, 文脈付けられた状態から， Unicode の UTF-16 符号化法への符号変換を示す：
<span lang="en">
EXAMPLE: This second example shows a much more complex case, where the Arabic word '&#x0627;&#x0644;&#x0633;&#x0644;&#x0627;&#x0645;', meaning 'peace', is transcoded from the visually-ordered, contextualized encoding IBM CP864 to the UTF-16 encoding of Unicode:
</span></p>

<table>
<caption>
UTF-16 から IBM CP864 への対応関係
<span lang="en">
table displaying the mapping from UTF-16 to IBM CP864
</span></caption>
<tbody>
<tr><th colspan="2">IBM CP864<th colspan="2">UTF-16
<tr><th>符号単位<th>（短縮）文字名<th>符号単位<th>（短縮）文字名<tr><td>EF<td>FINAL MEEM<td>0627<td>ALEF<tr><td>9E<td>MEDIAN LAM-ALEF<td>0644<td>LAM<tr><td>D3<td>MEDIAN SEEN<td>0633<td>SEEN<tr><td>E4<td>MEDIAN LAM<td>0644<td>LAM<tr><td>C7<td>INITIAL ALEF<td>0627<td>ALEF<tr><td><td><td>0645<td>MEEM
</tbody></table>

<p>
文字の順序が反対にされていることに注意。
CP864 の１個の LAM-ALEF が UTF-16 においては LAM, ALEF の並びに変換され, また
元の符号化法の，文脈に応じた変形（頭字／中字／尾字（ initial ／ median ／ final ））は、目的の符号化法においては，総称的な文字に変換されている。
<span lang="en">
Notice that the order of the characters has been reversed, that the single LAM-ALEF in CP864 has been converted to a LAM ALEF sequence in UTF-16, and that the contextual variants (initial, median or final) in the source encoding have been converted to generic characters in the target encoding.
</span></p></div>

	</section>
	<section id="sec-RefProcModel">

<h3 title="Reference Processing Model">4.3. 基準処理モデル</h3>

<p id="def-char-data">
Internet 上の大部分のプロトコルやデータ形式，特に，最も重要なウェブデータ形式［
HTML, CSS, XML
］は、テキストに基づいている。
それらのデータ形式は、テキストのみからなるが、
<dfn id="plain-text">プレーンテキスト</dfn>
（“素のテキスト” — マークアップやプログラミング言語の文脈下にないテキスト）自体が供するものに新たな機能性を加えるために、関連の仕様により，テキストに構造が持ち込まれ, 一定の構成子に意味が与えられる。
HTML と XML は
<dfn id="markup-languages">マークアップ言語</dfn>
である。
すなわち、文書は全体がテキストのみからなるものと規定されつつ，このテキストを
<dfn id="markup">マークアップ</dfn>
と
<dfn id="character-data">文字データ</dfn>
に分離するための規約も伴なわれる。
XML 1.0 仕様
<a href="#xml10">[XML 1.0]</a>
<a href="http://www.w3.org/TR/2004/REC-xml-20040204/#syntax">2.4 節</a>
からの引用：
<span lang="en">
Many Internet protocols and data formats, most notably the very important Web formats HTML, CSS and XML, are based on text. In those formats, everything is text but the relevant specifications impose a structure on the text, giving meaning to certain constructs so as to obtain functionality in addition to that provided by plain text (text that is not in the context of markup or a programming language). HTML and XML are markup languages, defining documents entirely composed of text but with conventions allowing the separation of this text into markup and character data. Citing from the XML 1.0 specification [XML 1.0], section 2.4:
</span></p>

<p>
<q cite="#xml10">
テキスト内容は文字データとマークアップの混成である…（中略）マークアップでないすべてのテキストは、文書の文字データをなす。
</q>
<span lang="en">
"Text consists of intermingled character data and markup. [...] All text that is not markup constitutes the character data of the document."
</span></p>

<p>
この節の目的において重要な点は、すべてが
<a href="#def-text">テキスト</a>
（すなわち，文字の並び）であることである。
<span lang="en">
For the purposes of this section, the important aspect is that everything is text, that is, a sequence of characters.
</span></p>

<p>
<dfn id="textual-data-object">テキストデータオブジェクト</dfn>
とは、全体がテキストからなるプロトコルメッセージや文書であるか, あるいは
その中の，格納／取得など 外部とのやりとりの目的で別々に扱われるテキストを意味する。
例えば， XML の外部解析対象実体や
テキストの MIME entity body<!--  -->
<a href="#MIME-entity">[MIME-entity]</a>
などが例として挙げられる。
<span lang="en">
A textual data object is a whole text protocol message or a whole text document, or a part of it that is treated separately for purposes of external storage and retrieval. Examples include external parsed entities in XML and textual MIME entity bodies [MIME-entity].
</span></p>

<div class="req"><h5 id="C013">C013<i>[仕様]</i><i>[内容]</i></h5>
<p>
プロトコル／データ形式 の仕様に定義されるテキストデータオブジェクトは、<em>単独の</em>文字符号化法に統一されなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C013 [S] [C] Textual data objects defined by protocol or format specifications MUST be in a single character encoding.
</span></p></div>

<p>
これは， ISO 2022 のような文字集合 切替スキームが利用できないことを意味するわけではないことに注意。
その種のスキームでは、１つの文字符号化法の下で文字集合の切替が遂行される。
<span lang="en">
Note that this does not imply that character set switching schemes such as ISO 2022 cannot be used, since such schemes perform character set switching within a single character encoding.
</span></p>

<p id="def-ref-proc-model">
草創期の頃から、ウェブでは
<dfn id="Reference-Processing-Model">基準処理モデル</dfn>
の開発が行われてきた。
初めて述べられたのは， HTML を対象にした RFC 2070
<a href="#rfc2070">[RFC 2070]</a>
である。
このモデルは後に， XML と CSS に取り込まれた。
上に述べられたように，それはテキストに基づくどんなデータ形式／プロトコルにも適用し得る。
基準処理モデルの本質は、 Unicode を共通の基準に利用する所にある。
仕様による基準処理モデルの利用は、しかしながら，実装が実際に Unicode を利用することを要求するわけではない。
実装には，その処理がモデルの記述に沿って行われるかのようにふるまうことのみが、要件として課される。
また、この文書は語 基準<em>処理</em>モデルを利用し，その処理の内容を通して その特性を述べるが、明示的に処理モデルを規定しない仕様にも，このモデルは適用される。
<span lang="en">
Since its early days, the Web has seen the development of a Reference Processing Model, first described for HTML in RFC 2070 [RFC 2070]. This model was later embraced by XML and CSS. It is applicable to any data format or protocol that is text-based as described above. The essence of the Reference Processing Model is the use of Unicode as a common reference. Use of the Reference Processing Model by a specification does not, however, require that implementations actually use Unicode. The requirement is only that the implementations behave as if the processing took place as described by the Model. Also, while this document uses the term Reference Processing Model and describes its properties in terms of processing, the model also applies to specifications that do not explicitly define a processing model.
</span></p>

<div class="req"><h5 id="C014">C014<i>[仕様]</i></h5>
<p>
テキスト処理を孕むすべての仕様は、その処理を次に示す
<a href="#sec-RefProcModel">基準処理モデル</a>
に従って指定しなければ<em class="rfc2119">ならない</em>：
<span lang="en">
C014 [S] All specifications that involve processing of text MUST specify the processing of text according to the Reference Processing Model, namely:
</span></p>

<ol>
	<li>
仕様は、バイトや
<a href="#def-glyph">グリフ</a>
ではなく， Unicode 文字を通して、テキストを定義しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
Specifications MUST define text in terms of Unicode characters, not bytes or glyphs.
</span></li>
	<li>
それらのテキストデータオブジェクトに対する仕様は、 Unicode 符号化形式に符号変換し得るものであれば，どのような文字符号化法の利用も許容しても<em class="rfc2119">よい</em>。
<span lang="en">
For their textual data objects specifications MAY allow use of any character encoding which can be transcoded to a Unicode encoding form.
</span></li>
	<li>
<p>
仕様は、一部の文字符号化法を否認または廃止して，他のものを義務付けても<em class="rfc2119">よい</em>。
実際の文字符号化法に関係なく、そのふるまいは，次で与えられる処理が<em >行われたかのように</em>指定されなければ<em class="rfc2119">ならない</em>：
<span lang="en">
Specifications MAY choose to disallow or deprecate some character encodings and to make others mandatory. Independent of the actual character encoding, the specified behavior MUST be the same as if the processing happened as follows:
</span></p>

		<ul>
			<li>
その仕様を実装するアプリケーションにより受信されたテキストデータオブジェクトの文字符号化法<!-- CEF -->は，決定されなければ<em class="rfc2119">ならず</em>、データオブジェクトは， Unicode 文字の並びとして解釈されなければ<em class="rfc2119">ならない</em>
—
これは、そのデータオブジェクトから，何らかの
<a href="#Unicode_Encoding_Form">Unicode 符号化形式</a>
への［
必要なら文字符号化法ラベルの補正も伴われた，その Unicode 符号化形式で受信するような
］
<a href="#def-transcoding">符号変換</a>
と等価でなければ<em class="rfc2119">ならない</em>。
<span lang="en">
The character encoding of any textual data object received by the application implementing the specification MUST be determined and the data object MUST be interpreted as a sequence of Unicode characters - this MUST be equivalent to transcoding the data object to some Unicode encoding form, adjusting any character encoding label if necessary, and receiving it in that Unicode encoding form.
</span></li>
			<li>
すべての処理は、この Unicode 文字の並びの上で行われなければ<em class="rfc2119">ならない</em>。
<span lang="en">
All processing MUST take place on this sequence of Unicode characters.
</span></li>
			<li>
テキストがアプリケーションから出力される場合、 Unicode 文字の並びは，仕様で許容されているものから選定された文字符号化法を用いて，符号化されなければ<em class="rfc2119">ならない</em>。
<span lang="en">
If text is output by the application, the sequence of Unicode characters MUST be encoded using a character encoding chosen among those allowed by the specification.
</span></li>
		</ul>
	</li>
	<li>
仕様が，複数のテキストデータオブジェクトを孕んでいる場合（例えば外部解析対象実体を参照している XML 文書など）、これらのデータオブジェクトそれぞれに異なる文字符号化法が選ばれても<em class="rfc2119">よい</em>。
いずれにせよ、すべてのテキストデータオブジェクトに，
<a href="#sec-RefProcModel">基準処理モデル</a>
が適用されなければ<em class="rfc2119">ならない</em>。
<span lang="en">
If a specification is such that multiple textual data objects are involved (such as an XML document referring to external parsed entities), it MAY choose to allow these data objects to be in different character encodings. In all cases, the Reference Processing Model MUST be applied to all textual data objects.
</span></li>
</ol>

</div>


<p class="note"><b>注記：</b>
XML 1.0 仕様
<a href="#xml10">[XML 1.0]</a>
のアプリケーションを定義するすべての仕様は、自動的に，この基準処理モデルを継承する。
XML では、仕様全体が Unicode 文字を通して定義されており，解析対象実体には他の文字符号化法も許容されつつ，<span class="trans-note">【
処理モデルにおいては
】</span>
UTF-8 ／ UTF-16 文字符号化法の利用が要求されている。
<span lang="en">
NOTE: All specifications which define applications of the XML 1.0 specification [XML 1.0] automatically inherit this Reference Processing Model. XML is entirely defined in terms of Unicode characters and requires the UTF-8 and UTF-16 character encodings while allowing any other character encoding for parsed entities.
</span></p>

<p class="note"><b>注記：</b>
仕様において Unicode 符号化形式でない文字符号化法が許容される場合、実装者は，その種の符号化法の文字と Unicode 文字との対応関係が、実施において
<a href="#def-transcoding">符号変換</a>
に利用されるソフトウェアに依存することに，留意するべきである。
その種の不一致については、例えば 日本語 XML Profile
<a href="#XML_Japanese_profile">[XML Japanese Profile]</a>
を参照されたし。
<span lang="en">
NOTE: When specifications choose to allow character encodings other than Unicode encoding forms, implementers should be aware that the correspondence between the characters of such encodings and Unicode characters may in practice depend on the software used for transcoding. See the Japanese XML Profile [XML Japanese Profile] for examples of such inconsistencies.
</span></p>

<div class="req"><h5 id="C070">C070<i>[仕様]</i></h5>
<p>
仕様は、全範囲（すなわち， U+0000 〜 U+10FFFF ）の <em>いかなる</em> Unicode 
<a href="#def-CCS">符号位置</a>
も，恣意的に除外する<em class="rfc2119">べき</em>でない。
<span lang="en">
C070 [S] Specifications SHOULD NOT arbitrarily exclude code points from the full range of Unicode code points from U+0000 to U+10FFFF inclusive.
</span></p></div>

<div class="req"><h5 id="C077">C077<i>[仕様]</i></h5>
<p>
仕様は、 U+10FFFF を越える符号位置を許容しては<em class="rfc2119">ならない</em>。
<span lang="en">
C077 [S] Specifications MUST NOT allow code points above U+10FFFF.
</span></p></div>

<p>
Unicode の中には、内部利用（非文字など）や特殊な機能のための符号位置（代用符号位置など）が一部含まれている。
<span lang="en">
Unicode contains some code points for internal use (such as noncharacters) or special functions (such as surrogate code points).
</span></p>

<div class="req"><h5 id="C079">C079<i>[仕様]</i></h5>
<p>
仕様は、 Unicode により内部利用のために予約されている符号位置の利用を許容する<em class="rfc2119">べき</em>でない。
<span lang="en">
C079 [S]Specifications SHOULD NOT allow the use of codepoints reserved by Unicode for internal use.
</span></p></div>

<div class="req"><h5 id="C078">C078<i>[仕様]</i></h5>
<p>
仕様は、代用符号位置の利用を許容しては<em class="rfc2119">ならない</em>。
<span lang="en">
C078 [S] Specifications MUST NOT allow the use of surrogate code points.
</span></p></div>

<p>
相当の理由も無く，一部の符号位置を除外することは、 W3C の世界共通のアクセス性の目標と競合する。
符号位置の除外は、利用者その他のコミュニティにとって重要になり得る 一部の用字系を利用できなくする。
例えば、大きな理由も無く，基本多言語面より先の符号位置を除外するとする裁定や,
符号位置を US-ASCII ／ Latin-1 レパートリに制限することは適切でない。
また、 Unicode 標準では，ソフトウェアが どの符号位置も損なわないことが要求されていることにも注意。
<span lang="en">
Excluding code points without good reason conflicts with the W3C goal of universal accessibility. Excluding code points would prevent some scripts from being used which may be important to a user community or communities. For example, without strong reasons to do so, decisions to exclude code points above the Basic Multilingual Plane or to limit code points to the US-ASCII or Latin-1 repertoire are inappropriate. Also, please note that the Unicode Standard requires software to not corrupt any code points.
</span></p>

<p>
文字を除外する 合法的かつ恣意的でない理由としては、
<cite>Unicode in XML and other Markup Languages</cite>
<a href="#UXML">[UXML]</a>
が挙げられる。
そこでは、一部の文字の利用が次のような理由で抑止されている：
<span lang="en">
Other examples of legitimate and non-arbitrary reasons to exclude characters can be seen in Unicode in XML and other Markup Languages [UXML], where the use of certain characters is discouraged for reasons such as:
</span></p>

<ul>
	<li>
それらは Unicode 標準により廃止予定にされた。
<span lang="en">
They are deprecated in the Unicode Standard.
</span></li>
	<li>
それらは 追加のデータなしにサポートされ得ない。
<span lang="en">
They cannot be supported without additional data.
</span></li>
	<li>
それらは マークアップの方がより上手く取り扱える。
<span lang="en">
They are better handled by markup.
</span></li>
	<li>
それらは 等価なマークアップと競合する。
<span lang="en">
They conflict with equivalent markup.
</span></li>
</ul>

	</section>
	<section id="sec-Encodings">

<h3 title="Choice and Identification of Character Encodings">4.4. 文字符号化法の選定と識別</h3>

<p>
符号化されたテキストは，符号化法を知ることなしに 解釈／処理 を<em >行えない</em>ので、文字符号化法（
4.1 節：<a href="#sec-Digital"><b>文字符号化法</b></a>
を見よ）が、テキストが 交換／格納／処理 されるあらゆる時と場所において既知であることは，決定的に重要になる。
以下では、利用する
“<span class="qterm">文字符号化法</span>”
は，文脈に依存して
<a href="#def-CEF">文字符号化形式 (CEF)</a>
または
<a href="#def-CES">文字符号化スキーム (CES)</a>
のいずれかを意味する。
テキストがバイトストリームとして伝送／格納されるときは（例えばプロトコルやファイルシステムの中で）、正しく解釈されることを確保するため，
<a href="#def-CES">CES</a>
の指定が要求される。
API などの文脈では、複数バイトのバイト順は，環境（概して，プロセッサアーキテクチャ）から指定されるので、
<a href="#def-CEF">CEF</a>
の指定で足る。
<span lang="en">
Because encoded text cannot be interpreted and processed without knowing the encoding, it is vitally important that the character encoding (see 4.1 Character Encoding) is known at all times and places where text is exchanged, stored or processed. In what follows we use 'character encoding' to mean either character encoding form (CEF) or character encoding scheme (CES) depending on the context. When text is transmitted or stored as a byte stream, for instance in a protocol or file system, specification of a CES is required to ensure proper interpretation. In contexts such as an API, where the environment (typically the processor architecture) specifies the byte order of multibyte quantities, specification of a CEF suffices.
</span></p>

<div class="req"><h5 id="C015">C015<i>[仕様]</i></h5>
<p>
仕様は、一意的な文字符号化法を指定するか, または
テキストの符号化法が確実に識別されるような，文字符号化法を識別するための仕組み<!--1-->を提供しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C015 [S] Specifications MUST either specify a unique character encoding, or provide character encoding identification mechanisms such that the encoding of text can be reliably identified.
</span></p></div>

<div class="req"><h5 id="C016">C016<i>[仕様]</i></h5>
<p>
新たな
プロトコル／データ形式／ API
が設計される際には、仕様は，一意的な文字符号化法を要求する<em class="rfc2119">べき</em>である。
<span lang="en">
C016 [S] When designing a new protocol, format or API, specifications SHOULD require a unique character encoding.
</span></p></div>

<div class="req"><h5 id="C017">C017<i>[仕様]</i></h5>
<p>
文字符号化法のための規則をすでに備えている
プロトコル／データ形式／ API
に基づいて， プロトコル／データ形式／ API を定める際には、
仕様は，それらの規則を変更せずに，そのまま利用する<em class="rfc2119">べき</em>である。
<span lang="en">
C017 [S] When basing a protocol, format, or API on a protocol, format, or API that already has rules for character encoding, specifications SHOULD use rather than change these rules.
</span></p></div>

<p class="example"><b>例：</b>
XML に基づくデータ形式が外部実体の文字符号化法を選定／決定する際には、新たなものを考案せずに，既存の XML 規則を利用するべきである。
<span lang="en">
EXAMPLE: An XML-based format should use the existing XML rules for choosing and determining the character encoding of external entities, rather than invent new ones.
</span></p>


		</section>
		<section id="sec-UniqueEncoding">

<h4 title="Mandating a unique character encoding">4.4.1. 一意的な文字符号化法の義務付け</h4>

<p>
一意的な文字符号化法の義務付けは単純かつ効率的で堅牢になる。
符号化法タグを 指定／生成／伝送／解釈 する必要もなくなり、受信側からは，文字符号化法が常に解されることになる。
データが非電子媒体を通して転送された後で，元のデジタル表現に復元する必要が生じた場合でも、利用された文字符号化法についての曖昧さは生じない。
既存のデータ, システム, プロトコル, アプリケーションとの互換性のために 複数の文字符号化法が必要になる場合でも、それらは，しばしば プロトコル, データ形式, API の境界, あるいはそれらの外側で取り扱い得る。
<abbr title="Document Object Model">DOM</abbr>
<a href="#dom1">[DOM Level 1]</a>
は、これが行われている例になる。
一意的な文字符号化法の選定により、少量のテキストを扱うときや, 仕様が実際の処理に密接なときには，より大きな優位性が得られる。
<span lang="en">
Mandating a unique character encoding is simple, efficient, and robust. There is no need for specifying, producing, transmitting, and interpreting encoding tags. At the receiver, the character encoding will always be understood. There is also no ambiguity as to which character encoding to use if data is transferred non-electronically and later has to be converted back to a digital representation. Even when there is a need for compatibility with existing data, systems, protocols and applications, multiple character encodings can often be dealt with at the boundaries or outside a protocol, format, or API. The DOM [DOM Level 1] is an example of where this was done. The advantages of choosing a unique character encoding are greater when text sizes are small or the specification is close to the actual processing.
</span></p>

<div class="req"><h5 id="C018">C018<i>[仕様]</i></h5>
<p>
一意的な文字符号化法が要求される場合、その文字符号化法は UTF-8, UTF-16, UTF-32 のいずれかでなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C018 [S] When a unique character encoding is required, the character encoding MUST be UTF-8, UTF-16 or UTF-32.
</span></p></div>

<p>
UTF-8 は US-ASCII の上位互換なので（ US-ASCII 文字列は UTF-8 文字列でもある —
<a href="#rfc3629">[RFC 3629]</a>
を見よ）、 US-ASCII との互換性が望まれる場合には UTF-8 が適切になる。
他の状況，例えば API などでは、 UTF-16 や UTF-32 がより適切なものになり得る。
これらを選定する際に考慮される理由には、内部処理の効率性や, 他の処理との相互運用性などが挙げられる。
<span lang="en">
US-ASCII is upwards-compatible with UTF-8 (an US-ASCII string is also a UTF-8 string, see [RFC 3629]), and UTF-8 is therefore appropriate if compatibility with US-ASCII is desired. In other situations, such as for APIs, UTF-16 or UTF-32 may be more appropriate. Possible reasons for choosing one of these include efficiency of internal processing and interoperability with other processes.
</span></p>

<p class="note"><b>注記：</b>
IETF Charset Policy
<a href="#rfc2277">[RFC 2277]</a>
には、
<q cite="#rfc2277" >プロトコルは UTF-8 charset を利用できなければ<em class="rfc2119">ならない</em></q>
と指定されている。
<span lang="en">
NOTE: The IETF Charset Policy [RFC 2277] specifies that on the Internet "Protocols MUST be able to use the UTF-8 charset".
</span></p>

<p class="note"><b>注記：</b>
XML 1.0 仕様
<a href="#xml10">[XML 1.0]</a>
では、すべての適合 XML プロセッサに対し， UTF-16 と UTF-8 のいずれも受け入れることを要求している。
<span lang="en">
NOTE: The XML 1.0 specification [XML 1.0] requires all conforming XML processors to accept both UTF-16 and UTF-8.
</span></p>

		</section>
		<section id="sec-EncodingIdent">

<h4 title="Character encoding identification">4.4.2. 文字符号化法の識別</h4>

<p>
MIME Internet 仕様は、文字符号化法を識別するための仕組み<!--1-->
<a href="#MIME-charset">[MIME-charset]</a>
<a href="#RFC2978">[RFC 2978]</a>
の好例である。
MIME <code class="keyword">charset</code> パラメタは、受信されたデータのバイト列を 文字の並びに一意的に復号するに足る情報を供するものとして、定義されている。
その値は IANA charset レジストリ
<a href="#iana">[IANA]</a>
から抜き出されたものになる。
<span lang="en">
The MIME Internet specification provides a good example of a mechanism for character encoding identification [MIME-charset][RFC 2978]. The MIME charset parameter definition is intended to supply sufficient information to uniquely decode the sequence of bytes of the received data into a sequence of characters. The values are drawn from the IANA charset registry [IANA].
</span></p>

<p class="note"><b>注記：</b>
具合が悪いことに、一部の charset 識別子は，単一の, 一意的な文字符号化法を表現していない。
これらの識別子は小さな多様性を孕んでいる。
小さくても，その違いは重大になり得るし, 時の経過と共に変わり得る。
これらの識別子の下では、バイト並びから文字並びへ復元する際に確定できない部分が生じる。 
例えば Shift_JIS では， 0x5C に符号化された文字が確定できない。
この符号位置は
<span class="uname">YEN SIGN</span>
を表現することもあれば，
<span class="uname">REVERSE SOLIDUS</span>
を表現することもある。
この例についての詳細および，他のその種の曖昧性を孕む charset 識別子については
<a href="#XML_Japanese_profile">[XML Japanese Profile]</a>
を参照。
<span lang="en">
NOTE: Unfortunately, some charset identifiers do not represent a single, unique character encoding. Instead, these identifiers denote a number of small variations. Even though small, the differences may be crucial and may vary over time. For these identifiers, recovery of the character sequence from a byte sequence is ambiguous. For example, the character encoded as 0x5C in Shift_JIS is ambiguous. This code point sometimes represents a YEN SIGN and sometimes represents a REVERSE SOLIDUS. See the [XML Japanese Profile] for more detail on this example and for additional examples of such ambiguous charset identifiers.
</span></p>

<p class="note" id="def-charset"><b>注記：</b>
語
<dfn id="charset">charset</dfn>
は、
“<span class="qterm">文字集合</span>”
（ character set ）に由来するもので，長く込み入った歴史を伴う表記である（更なる論は
<a href="#connolly">[Connolly]</a>
を参照）。
<span lang="en">
NOTE: The term charset derives from 'character set', an expression with a long and tortured history (see [Connolly] for a discussion).
</span></p>

<div class="req"><h5 id="C020">C020<i>[仕様]</i></h5>
<p>
仕様は、文字符号化法の参照に語
“<span class="qterm">文字集合</span>”
や
“<span class="qterm">charset</span>”
を用いる<em class="rfc2119">べき</em>でない。
ただし、後者については，
MIME <code class="keyword">charset</code>
パラメタあるいは その IANA に登録済みの値を参照する際に利用される場合は除く。
用語
“<span class="qterm">文字符号化法</span>”
，または 特定の場合には用語
“<span class="qterm">文字符号化形式</span>”
または
“<span class="qterm">文字符号化スキーム</span>”
の利用が<em class="rfc2119">推奨される</em>。
<span lang="en">
C020 [S] Specifications SHOULD avoid using the terms 'character set' and 'charset' to refer to a character encoding, except when the latter is used to refer to the MIME charset parameter or its IANA-registered values. The term 'character encoding', or in specific cases the terms 'character encoding form' or 'character encoding scheme', are RECOMMENDED.
</span></p></div>

<p class="note"><b>注記：</b>
XML においては、
XML 宣言／テキスト宣言に含められた
<code>encoding</code>
疑似属性により， IANA charset を用いて文字符号化法が識別される。
<!-- テキスト宣言
http://www.atmarkit.co.jp/fxml/rensai/w3cread26/w3cread26_2.html -->
<span lang="en">
NOTE: In XML, the XML declaration or the text declaration contains the encoding pseudo-attribute which identifies the character encoding using the IANA charset.
</span></p>

<p>
IANA charset レジストリは、 Internet 上の文字符号化スキーム名とそれらの別名が含められている，公式のリストである。
<span lang="en">
The IANA charset registry is the official list of names and aliases for character encoding schemes on the Internet.
</span></p>

<div class="req"><h5 id="C021">C021<i>[仕様]</i></h5>
<p>
仕様は、一意的な符号化法を採用しない場合には，
IANA charset レジストリに含まれる名前の利用を
—
特に，
プロトコル／データ形式／ API
の文字符号化法として指定<!--1-->する際には，そのレジストリの中の
“<span class="qterm">推奨 MIME 名</span>”
（ MIME preferred names ）として識別される名前の利用を
—
要求する<em class="rfc2119">べき</em>である。
<span lang="en">
C021 [S] If the unique encoding approach is not taken, specifications SHOULD require the use of the IANA charset registry names, and in particular the names identified in the registry as 'MIME preferred names', to designate character encodings in protocols, data formats and APIs.
</span></p></div>

<div class="req"><h5 id="C022">C022<i>[仕様]</i><i>[実装]</i><i>[内容]</i></h5>
<p>
IANA レジストリに含まれていない文字符号化法は、私的な合意が無い限り，利用される<em class="rfc2119">べき</em>でない。
<span lang="en">
C022 [S] [I] [C] Character encodings that are not in the IANA registry SHOULD NOT be used, except by private agreement.
</span></p></div>

<div class="req"><h5 id="C023">C023<i>[仕様]</i><i>[実装]</i><i>[内容]</i></h5>
<p>
未登録の文字符号化法が利用される場合、名前の先頭に “<span class="qterm">x-</span>” を付与する慣行に従わなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C023 [S] [I] [C] If an unregistered character encoding is used, the convention of using 'x-' at the beginning of the name MUST be followed.
</span></p></div>

<div class="req"><h5 id="C049">C049<i>[実装]</i><i>[内容]</i></h5>
<p>
内容に対する文字の符号化は、利用する側からは解されそうにないような，解り難い符号化法は避けつつ，なるべく直接的に文字を表現し得る方法（すなわち，
<a href="#def-char-escape">文字エスケープ</a>
などの
<a href="#def-char-data">マークアップ</a>
で文字を表現する必要を最小限に抑える方法）を採る<em class="rfc2119">べき</em>である。
<span lang="en">
C049 [I] [C] The character encoding of content SHOULD be chosen so that it maximizes the opportunity to directly represent characters (ie. minimizes the need to represent characters by markup means such as character escapes) while avoiding obscure encodings that are unlikely to be understood by recipients.
</span></p></div>

<p class="note"><b>注記：</b>
巨大なレパートリを備え, 広範からのサポートもある Unicode に基づく文字符号化法は、文書の符号化として好ましい選定である。
<span lang="en">
NOTE: Due to Unicode's large repertoire and wide base of support, a character encoding based on Unicode is a good choice to encode a document.
</span></p>

<div class="req"><h5 id="C034">C034<i>[内容]</i></h5>
<p>
文字符号化法を識別するための機構が既に用意されている場合、内容は，それらを利用しなければ<em class="rfc2119">ならない</em>。
“既に用意されている” には、既定として定められるもの（例えば XML 1.0
<a href="#xml10">[XML 1.0]</a>
のもの）も含まれる。
その種の既定への依拠は、この，識別のための要件を満たすに足る。
<span lang="en">
C034 [C] If facilities are offered for identifying character encoding, content MUST make use of them; where the facilities offered for character encoding identification include defaults (e.g. in XML 1.0 [XML 1.0]), relying on such defaults is sufficient to satisfy this identification requirement.
</span></p></div>

<div class="req"><h5 id="C024">C024<i>[実装]</i><i>[内容]</i></h5>
<p>
テキストデータにラベル付けする 内容, およびソフトウェアは、適切な仕様（例えば XML テキストを編集する場合は XML 仕様）にて要求されている，いずれかの名前を利用しなければ<em class="rfc2119">ならない</em>。
また、 MIME preferred name に属する名前に対応する文字符号化法のデータにラベル付けする際には，その名前を利用する<em class="rfc2119">べき</em>である。
<span lang="en">
C024 [I] [C] Content and software that label text data MUST use one of the names required by the appropriate specification (e.g. the XML specification when editing XML text) and SHOULD use the MIME preferred name of a character encoding to label data in that character encoding.
</span></p>

<p class="trans-note">【
ラベル — データに利用されている文字符号化法を識別するために付与される名前
】</p>

</div>

<div class="req"><h5 id="C025">C025<i>[実装]</i><i>[内容]</i></h5>
<p>
IANA に登録済みの
<code class="keyword">charset</code>
名は、その名前の IANA 登録において識別されるもの以外の文字符号化法によるテキストデータのラベル付けに，利用されては<em class="rfc2119">ならない</em>。
<span lang="en">
C025 [I] [C] An IANA-registered charset name MUST NOT be used to label text data in a character encoding other than the one identified in the IANA registration of that name.
</span></p></div>

<div class="req"><h5 id="C026">C026<i>[仕様]</i></h5>
<p>
一意的な符号化法を採用しない仕様は、少なくとも Unicode の UTF-8, UTF-16 符号化形式いずれかを，適格な文字符号化法として指定<!--1-->しなければ<em class="rfc2119">ならない</em>。
また、 UTF-8 または UTF-16 のいずれかを，要求される符号化形式（仕様の実装からサポートされなければ<em class="rfc2119">ならない</em>符号化形式）として採用する<em class="rfc2119">べき</em>である。
<span lang="en">
C026 [S] If the unique encoding approach is not chosen, specifications MUST designate at least one of the UTF-8 and UTF-16 encoding forms of Unicode as admissible character encodings and SHOULD choose at least one of UTF-8 or UTF-16 as required encoding forms (encoding forms that MUST be supported by implementations of the specification).
</span></p></div>

<div class="req"><h5 id="C027">C027<i>[仕様]</i></h5>
<p>
既定の符号化法を要する仕様は、
UTF-8 と UTF-16 のいずれか一方を, あるいは
それらの判別に適する手段を定める場合は両者を，既定のものとして定義しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C027 [S] Specifications that require a default encoding MUST define either UTF-8 or UTF-16 as the default, or both if they define suitable means of distinguishing them.
</span></p></div>

<div class="req"><h5 id="C028">C028<i>[仕様]</i></h5>
<p>
仕様は、データの符号化法を決定する方法に経験則の利用を提案しては<em class="rfc2119">ならない</em>。
<span lang="en">
C028 [S] Specifications MUST NOT propose the use of heuristics to determine the encoding of data.
</span></p></div>

<p>
経験則の例としては、バイト（パタン）や文字（パタン）の頻度に対する統計的解析の利用が挙げられる。
経験則は実装間でふるまいが一貫しないので好ましくない。
XML 1.0
<a href="#xml10">[XML 1.0]</a>,
<a href="http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing">Appendix F</a>
にて与えられるような、明確な指示書きにより，文字符号化法を一義的に決定する方法は、経験則とは見なされない。
<span lang="en">
Examples of heuristics include the use of statistical analysis of byte (pattern) frequencies or character (pattern) frequencies. Heuristics are bad because they will not work consistently across different implementations. Well-defined instructions of how to unambiguously determine a character encoding, such as those given in XML 1.0 [XML 1.0], Appendix F, are not considered heuristics.
</span></p>

<div class="req"><h5 id="C029">C029<i>[実装]</i></h5>
<p>
<em>受信側の</em>ソフトウェアは、データの符号化法を，利用可能な情報から適切な仕様に従って決定しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C029 [I] Receiving software MUST determine the encoding of data from available information according to appropriate specifications.
</span></p></div>

<div class="req"><h5 id="C030">C030<i>[実装]</i></h5>
<p>
IANA に登録済みの
<code class="keyword">charset</code>
名が認識されたときは、受信側のソフトウェアは、受信されたデータを IANA レジストリの中の その名前に結び付けられている符号化法に従って，解釈しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C030 [I] When an IANA-registered charset name is recognized, receiving software MUST interpret the received data according to the encoding associated with the name in the IANA registry.
</span></p></div>

<div class="req"><h5 id="C031">C031<i>[実装]</i></h5>
<p>
charset が供されていないときは、受信側のソフトウェアは、仕様に指定されている既定の文字符号化法を遵守しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C031 [I] When no charset is provided receiving software MUST adhere to the default character encoding(s) specified in the specification.
</span></p></div>

<p>
受信側のソフトウェアは、必要に応じて，任意数の
文字符号化法および charset 名とその別名
を認識してよい。
<span lang="en">
Receiving software may recognize as many character encodings and as many charset names and aliases for them as appropriate.
</span></p>

<p>
field-upgradeable
<span class="trans-note">【具体的には何？】</span>
の仕組み<!--1-->は、この目的に適切なものになる。
一部の文字符号化法は、ある程度 一定の言語との結び付きが強い（例えば Shift_JIS と日本語）。
与えられた言語や一定層の顧客のサポートは、一定の文字符号化法のサポートの必要を意味し得る。
しかしながら、好まれてはいるが要求されてはいない符号化法が，普遍的にサポートされているものと見做すことはできない。
サポートを要する文字符号化法は，時の経過に伴い変わり得る。
この文書は、与えられた言語のサポートに，どの文字符号化法が適切／必要であるかについてのアドバイスは供さない。
<span lang="en">
A field-upgradeable mechanism may be appropriate for this purpose. Certain character encodings are more or less associated with certain languages (e.g. Shift_JIS with Japanese). Trying to support a given language or set of customers may mean that certain character encodings have to be supported. However, one cannot assume universal support for a favoured but non-required encoding. The character encodings that need to be supported may change over time. This document does not give any advice on which character encoding may be appropriate or necessary for the support of any given language.
</span></p>

<p>
ウェブアーキテクチャは層の積み重ねであるので（例えばプロトコルを通して利用されるデータ形式）、文字符号化法についての情報が複数あったり競合することが起こり得る。
<span lang="en">
Because of the layered Web architecture (e.g. formats used over protocols), there may be multiple and at times conflicting information about character encoding.
</span></p>

<div class="req"><h5 id="C035">C035<i>[仕様]</i></h5>
<p>
仕様は、文字符号化法についての情報が複数ある, あるいは競合する場合に，それを解決する仕組み<!--1-->（例えば優先順位）を規定しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C035 [S] Specifications MUST define conflict-resolution mechanisms (e.g. priorities) for cases where there is multiple or conflicting information about character encoding.
</span></p></div>

<div class="req"><h5 id="C033">C033<i>[実装]</i></h5>
<p>
ソフトウェアは、文字符号化法の識別と競合の解決のための仕組み<!--1-->を，完全に実装しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C033 [I] Software MUST completely implement the mechanisms for character encoding identification and conflict resolution.
</span></p></div>


		</section>
	</section>
	<section id="sec-PrivateUse">

<h3 title="Private use code points">4.5. 私用のための符号位置</h3>

<p>
Unicode の中の一定範囲の
<a href="#def-CCS">符号位置</a>
—
私用領域 （ <abbr title="Private Use Area">PUA</abbr> — U+E000-F8FF ）と第 15, 16 面（ U+F0000-FFFFD と U+100000-10FFFD ）
—
は、私用用途としての指定<!--1-->を受けている。
これらの符号位置は，標準文字には決して割り当てられないことが保証されており、私的合意の下に利用できる。
しかしながら、私的合意は，異なるそれらの間で符号位置が衝突し得るので，ウェブ上まで拡大されることはない。
また、私的合意であるがため，その符号位置の意味は急速に失われ易い。
<span lang="en">
Certain ranges of Unicode code points are designated for private use: the Private Use Area (PUA) (U+E000-F8FF) and planes 15 and 16 (U+F0000-FFFFD and U+100000-10FFFD). These code points are guaranteed to never be allocated to standard characters, and are available for use by private agreement. However, private agreements do not scale on the Web. Code points from different private agreements may collide. Also, a private agreement, and therefore the meaning of the code points, can quickly become lost.
</span></p>

<div class="req"><h5 id="C073">C073<i>[内容]</i></h5>
<p>
公に交換される内容は、私用領域の符号位置を利用する<em class="rfc2119">べき</em>でない。
<span lang="en">
C073 [C] Publicly interchanged content SHOULD NOT use codepoints in the private use area.
</span></p></div>

<p class="note"><b>注記：</b>
例外的な PUA の利用として代表的なものには、符号化されたことのない用字系（例えば 歴史的あるいは稀なもの）に対する符号化法の設計, あるいは試験が挙げられる。
<span lang="en">
NOTE: A typical exception would be the use of the PUA to design and test the encoding of not yet encoded (e.g. historic or rare) scripts.
</span></p>

<div class="req"><h5 id="C076">C076<i>[内容]</i></h5>
<p>
内容は、符号位置を，その符号化文字集合に定義されている目的以外に利用しては<em class="rfc2119">ならない</em>。
<span lang="en">
C076 [C]Content MUST NOT use a code point for any purpose other than that defined by its coded character set.
</span></p></div>

<p>
したがって、例えば，
iso-8859-1 に符号化されるものとは実際には異なる
用字系／文字／記号
を表現する目的で， ISO Latin 1 文字集合の符号位置を意図的に誤利用するようなフォントの構築などは、禁止される。
<span lang="en">
This prohibits, for example, the construction of fonts that misuse the codepoints in the ISO Latin 1 character set to represent different scripts, characters, or symbols than those actually encoded in iso-8859-1.
</span></p>

<div class="req"><h5 id="C038">C038<i>[仕様]</i></h5>
<p>
仕様は、特定の割り当てを伴うような私用領域の文字の利用を要求しては<em class="rfc2119">ならない</em>。
<span lang="en">
C038 [S] Specifications MUST NOT require the use of private use area characters with particular assignments.
</span></p></div>

<div class="req"><h5 id="C039">C039<i>[仕様]</i></h5>
<p>
仕様は、私用符号位置についての合意を定義する仕組み<!--1-->の利用を要求しては<em class="rfc2119">ならない</em>。
<span lang="en">
C039 [S] Specifications MUST NOT require the use of mechanisms for defining agreements of private use code points.
</span></p></div>

<div class="req"><h5 id="C040">C040<i>[仕様]</i><i>[実装]</i></h5>
<p>
仕様と実装は、私的合意による私用符号位置の利用を禁ずる<em class="rfc2119">べき</em>でない。
<span lang="en">
C040 [S] [I] Specifications and implementations SHOULD NOT disallow the use of private use code points by private agreement.
</span></p></div>

<p>
例えば XML は，私用符号位置の利用を禁じていない。
<span lang="en">
As an example, XML does not disallow the use of private use code points.
</span></p>

<div class="req"><h5 id="C041">C041<i>[仕様]</i></h5>
<p>
仕様は、 Unicode に無い記号の伝達や Unicode 文字の特定の異体字の識別を可能にするための，
<a href="#def-char-data">マークアップ</a>
を定義しても<em class="rfc2119">よい</em>。
<span lang="en">
C041 [S] Specifications MAY define markup to allow the transmission of symbols not in Unicode or to identify specific variants of Unicode characters.
</span></p></div>

<p class="example"><b>例：</b>
MathML （
<a href="#mathml2">[MathML2]</a>
<a href="http://www.w3.org/TR/2003/REC-MathML2-20031021/chapter3.html#presm.mglyph">3.2.9 節</a>
）は、 Unicode には無い数学記号のために <code>mglyph</code> 要素を定義している。
<span lang="en">
EXAMPLE: MathML (see [MathML2] section 3.2.9) defines an element mglyph for mathematical symbols not in Unicode.
</span></p>

<p class="example"><b>例：</b>
SVG （
<a href="#svg">[SVG]</a>
<a href="http://www.w3.org/TR/2003/REC-SVG11-20030114/text.html#AlternateGlyphs">10.14 節</a>
）は、 Unicode 文字の特定の表示異体の識別を可能にする，
<code>altglyph</code> 要素を定義する。
<span lang="en">
EXAMPLE: SVG (see [SVG] section 10.14) defines an element altglyph which allows the identification of specific display variants of Unicode characters.
</span></p>

<div class="req"><h5 id="C068">C068<i>[仕様]</i></h5>
<p>
仕様は、絵図やグラフィックのために，文字に基づく仕組み<!--1-->を（誤）利用せずに済ませられるようにするため、適切な所で 絵図やグラフィックの埋め込みや参照を許容する<em class="rfc2119">べき</em>である。
<span lang="en">
C068 [S]Specifications SHOULD allow the inclusion of or reference to pictures and graphics where appropriate, to eliminate the need to (mis)use character-oriented mechanisms for pictures or graphics.
</span></p></div>

	</section>
	<section id="sec-Escaping">

<h3 title="Character Escaping">4.6. 文字エスケープ法</h3>

<p id="def-syntax-significant">
マークアップ言語やプログラミング言語では、しばしば，一定の文字が
<dfn id="syntax-significant">構文文字</dfn>
として指定<!--1-->され，言語における特定の機能が与えられる（例えば HTML と XML では
<span class="qchar">&lt;</span> と <span class="qchar">&amp;</span>
がマークアップ区切子になる）。
これらの構文文字は、テキストの中では他の文字と同じように自身を自身の表現に利用することができないので，何らかの “エスケープ” の仕組み<!--1-->が必要になる。
また、しばしば これと同じあるいは同様の仕組み<!--1-->により実現されるが、個々の文書やプログラム（マークアップやプログラミング言語のインスタンス）にて選定された文字符号化法の中では，直接的に表現できない文字も、表記できる必要がある。
<span lang="en">
Markup languages or programming languages often designate certain characters as syntax-significant, giving them specific functions within the language (e.g. '&lt;' and '&amp;' serve as markup delimiters in HTML and XML). As a consequence, these syntax-significant characters cannot be used to represent themselves in text in the same way as all other characters do, creating the need for a mechanism to "escape" their syntax-significance. There is also a need, often satisfied by the same or similar mechanisms, to express characters not directly representable in the character encoding chosen for a particular document or program (an instance of the markup or programming language).
</span></p>

<p id="def-char-escape">
公式的に述べるなら、
<dfn id="character-escape">文字エスケープ</dfn>
とは、マークアップやプログラミング言語により定義される 構文上の素子であって，以下のいずれかを許容するものである：
<span lang="en">
Formally, a character escape is a syntactic device defined in a markup or programming language that allows one or more of:
</span></p>

<ol>
	<li>
言語における構文上の意味を失わせつつ，構文文字を表記する,
または
<span lang="en">
expressing syntax-significant characters while disregarding their significance in the syntax of the language, or
</span></li>
	<li>
個々の言語にて選定された文字符号化法では表現できない文字を表記する,
または
<span lang="en">
expressing characters not representable in the character encoding chosen for an instance of the language, or
</span></li>
	<li>
一般の文字を，それに対応する符号化文字を利用せずに表記する。
<span lang="en">
expressing characters in general, without use of the corresponding encoded characters.
</span></li>
</ol>

<hr>

<p>
文字を
<dfn id="escaping">エスケープ</dfn>
するとは、その文字が現れるデータ形式／プロトコルに適切な構文上の素子を利用して，その文字を表記することを意味する。
<dfn id="expanding-a-character-escape">文字エスケープの展開</dfn>
（あるいは
<dfn id="unescaping">アンエスケープ</dfn>
） とは、それが表現する文字に置換することを意味する。
<span lang="en">
Escaping a character means expressing it using such a syntactic device, appropriate to the format or protocol in which the character appears; expanding a character escape (or unescaping) means replacing it with the character that it represents.
</span></p>

<p class="example"><b>例：</b>
HTML と XML では、構文上の意味のエスケープと, 任意の Unicode 文字の表記の，両方を可能にする “<span class="qterm">数値文字参照</span>”
が規定されている。
<samp >&amp;#x3C;</samp> あるいは <samp >&amp;#60;</samp> と表記された文字
<span class="qchar">&lt;</span>
は、マークアップ区切子として構文解析されないようになる。
<span lang="en">
EXAMPLE: HTML and XML define 'Numeric Character References' which allow both the escaping of syntax-significance and the expression of arbitrary Unicode characters. Expressed as &amp;#x3C; or &amp;#60; the character '&lt;' will not be parsed as a markup delimiter.
</span></p>

<p class="example"><b>例：</b>
プログラミング言語 Java は、文字列の区切りに二重引用符
<span class="qchar">"</span>
を利用している。
文字列内で
<span class="qchar">"</span>
を表記するときは、
<span class="qchar">\"</span>
のように，エスケープを施す。
<span lang="en">
EXAMPLE: The programming language Java uses '"' to delimit strings. To express '"' within a string, one may escape it as '\"'.
</span></p>

<p class="example"><b>例：</b>
XML には、 CDATA セクション区切子の間のすべての構文文字がエスケープされるようにする
“<span class="qterm">CDATA セクション</span>” が規定されている。
CDATA セクションの中では、数値文字参照を利用する文字の表記はできなくなる。
<span lang="en">
EXAMPLE: XML defines 'CDATA sections' which allow escaping the syntax-significance of all characters between the CDATA section delimiters. CDATA sections prevent the expression of characters using numeric character references.
</span></p>

<p>
仕様が文字エスケープを定義する際の方法には、以下の指針が適用される：
<span lang="en">
The following guidelines apply to the way specifications define character escapes.
</span></p>

<ul>
	<li>

<!-- p id="C0000" -->
<div class="req"><h5 id="C042">C042<i>[仕様]</i></h5>
<p>
仕様は、適切なものがすでにあるならば，新たなエスケープ法の仕組み<!--1-->を考案する<em class="rfc2119">べき</em>でない。
<span lang="en">
C042 [S] Specifications SHOULD NOT invent a new escaping mechanism if an appropriate one already exists.
</span></p></div>

</li>
	<li>

<div class="req"><h5 id="C043">C043<i>[仕様]</i></h5>
<p>
文字をエスケープする方法は，できるだけ少ない種類に絞り込まれる<em class="rfc2119">べき</em>である（理想的には一種類）。
<span lang="en">
C043 [S] The number of different ways to escape a character SHOULD be minimized (ideally to one).
</span></p></div>

<p>
よく知られた反例として、 HTML も XML も，歴史的な理由から，互いに冗長な２種類の文字エスケープ：
10 進（ <samp >&amp;#ddddd;</samp> ）によるものと, 16 進（ <samp >&amp;#xhhhh;</samp> ）によるものを備えている。
<span lang="en">
A well-known counter-example is that for historical reasons, both HTML and XML have redundant decimal (&amp;#ddddd;) and hexadecimal (&amp;#xhhhh;) character escapes.
</span></p>
	</li>
	<li>
<div class="req"><h5 id="C044">C044<i>[仕様]</i></h5>
<p>
エスケープ構文は、各文字エスケープに対し，明示的な終端区切子か一定個数の文字を要するようにする<em class="rfc2119">べき</em>である。
文字エスケープの終端がエスケープの中では適格でない文字から決定されるようなエスケープ構文は、避けられる<em class="rfc2119">べき</em>である。
<span lang="en">
C044 [S] Escape syntax SHOULD require either explicit end delimiters or a fixed number of characters in each character escape. Escape syntaxes where the end is determined by any character outside the set of characters admissible in the character escape itself SHOULD be avoided.
</span></p></div>

<p>
そのような文字エスケープは視覚的に明白でなく，エディタがスペースの所で行を折り返す際に，誤って改行を挿入する原因にもなり得る。
SPREAD <a href="#spread">[SPREAD]</a> の <samp >&amp;UABCD;</samp> 形や XML の <samp >&amp;#xhhhh;</samp> 形のように，文字エスケープが明示的にセミコロンで終了される方が、ずっと好ましい。
<span lang="en">
These character escapes are not clear visually, and can cause an editor to insert spurious line-breaks when word-wrapping on spaces. Forms like SPREAD's &amp;UABCD; [SPREAD] or XML's &amp;#xhhhh;, where the character escape is explicitly terminated by a semicolon, are much better.
</span></p>

	</li>
	<li>

<div class="req"><h5 id="C045">C045<i>[仕様]</i></h5>
<p>
仕様において，数値を用いる文字エスケープにより文字の表現を規定する際には、その数値がその文字の Unicode 符号位置を表現しなければ<em class="rfc2119">ならない</em>。
また、その数値は 16 進記法にされる<em class="rfc2119">べき</em>である。
<span lang="en">
C045 [S] Whenever specifications define character escapes that allow the representation of characters using a number, the number MUST represent the Unicode code point of the character and SHOULD be in hexadecimal notation.
</span></p></div>
	</li>
	<li>

<div class="req"><h5 id="C046">C046<i>[仕様]</i></h5>
<p>
エスケープされた文字は、その非エスケープ形が受容される所ならどこでも受容される<em class="rfc2119">べき</em>である —
ただし，このことは、
<a href="#def-syntax-significant">構文文字</a>
がエスケープされた際に，構文上の意味を失うことを妨げるものではない。
特に、ある文字が識別子やコメント内で受容される場合、その文字のエスケープ形も受容されるべきである。
<span lang="en">
C046 [S] Escaped characters SHOULD be acceptable wherever their unescaped forms are; this does not preclude that syntax-significant characters, when escaped, lose their significance in the syntax. In particular, if a character is acceptable in identifiers and comments, then its escaped form should also be acceptable.
</span></p></div>

	</li>
</ul>

<p>
内容開発者, およびその内容を生成するソフトウェアには、以下の指針が適用される：
<span lang="en">
The following guidelines apply to content developers, as well as to software that generates content:
</span></p>

<ul>
	<li>
<div class="req"><h5 id="C047">C047<i>[実装]</i><i>[内容]</i></h5>
<p>
エスケープは、表記される文字が，文書のデータ形式／文字符号化法の下では直接的に表現できないとき, または 文字の視覚的表現が明白でない所にのみ利用される<em class="rfc2119">べき</em>である。
<span lang="en">
C047 [I] [C] Escapes SHOULD only be used when the characters to be expressed are not directly representable in the format or the character encoding of the document, or when the visual representation of the character is unclear.
</span></p></div>

<p class="note"><b>注記：</b>
文字の視覚的表現が明白でない例としては、 NBSP （ non-breaking space ）と通常のスペースを判別可能にするための， <samp>&amp;nbsp;</samp> の利用が挙げられる。
<span lang="en">
NOTE: An example of when the visual representation of the character is unclear is the use of &amp;nbsp; to distinguish a non-breaking space from a normal space.
</span></p>
	</li>
	<li>
<div class="req"><h5 id="C048">C048<i>[実装]</i><i>[内容]</i></h5>
<p>
内容は、文字エスケープに 16 進形と 10 進形のいずれも利用できるときは、16 進形を利用する<em class="rfc2119">べき</em>である。
<span lang="en">
C048 [I] [C] Content SHOULD use the hexadecimal form of character escapes rather than the decimal form when there are both.
</span></p></div>

<p class="note"><b>注記：</b>
大抵の文字符号化法の標準仕様（特に Unicode ）は、文字の符号値を 16 進数として一覧にするので、16 進形の方が表引きが容易であり，好ましいものとされる。
<span lang="en">
NOTE: The hexadecimal form is preferred because character encoding standards (in particular Unicode) usually list character numbers as hexadecimal, making lookup easier.
</span></p>
	</li>
</ul>

	</section>

<section id="sec-Compatibility">

<h2 title="Compatibility and Formatting Characters">5. 互換文字と書式文字</h2>

<p>
この仕様は、
<a href="#def-char-data">マークアップ言語</a>
の利用における個々の文字の適性については言及しない。
特に、書式文字および互換等価物については，言及しない。
互換文字と書式文字の利用についての詳細な推奨は、
<cite>Unicode in XML and other Markup Languages</cite>
（ XML と他のマークアップ言語における Unicode ）
<a href="#UXML">[UXML]</a>
を参照のこと。
<span lang="en">
This specification does not address the suitability of particular characters for use in markup languages, in particular formatting characters and compatibility equivalents. For detailed recommendations about the use of compatibility and formatting characters, see Unicode in XML and other Markup Languages [UXML].
</span></p>

<div class="req"><h5 id="C050">C050<i>[仕様]</i></h5>
<p>
仕様は、それが定めるデータ形式の構文上の要素（マークアップ, 区切子, 識別子）から，互換文字を除外する<em class="rfc2119">べき</em>である。
<span lang="en">
C050 [S] Specifications SHOULD exclude compatibility characters in the syntactic elements (markup, delimiters, identifiers) of the formats they define.
</span></p></div>

</section>
<section id="sec-Indexing">

<h2 title="Strings">6. 文字列</h2>

	<section id="sec-Strings">

<h3 title="String concepts">6.1. 文字列の概念（ concept ）</h3>

<p>
様々な仕様が “<span class="qterm">文字列</span>” の認識概念（ notion ）を利用するが、その意味の定義が精確に与えられなかったり，他の仕様と異なるように定義されることもある。
実際、文字列には その認識概念の意図される利用に依存して，複数の理に適った定義がある。
これらは、実際には同じ実在 — コンピュータ内に格納されるテキスト片 — に対し，異なる見方を与えるものに過ぎないので、これら異なる認識概念のどれにも，語
“<span class="qterm">文字列</span>”
が利用されるのである。
<span lang="en">
Various specifications use the notion of a 'string', sometimes without defining precisely what is meant and sometimes defining it differently from other specifications. The reason for this variability is that there are in fact multiple reasonable definitions for a string, depending on one's intended use of the notion; the term 'string' is used for all these different notions because these are actually just different views of the same reality: a piece of text stored inside a computer.
</span></p>

<p class="trans-note">【
文字列<!--0-->（ string ） — 
訳語としては，この語が定着しているが、本来は “<em>同じもの</em>（同種のもの）の並び” を意味し、 “<em>文字</em>の列” よりも抽象的な概念としても用いられる。
】</p>


<p id="def-byte-string">
<dfn id="byte-string">バイト文字列</dfn>：
特定の文字符号化法の下で、文字を，それを表現するバイト列として捉えたときの文字列。
これは
<a href="#def-CES">文字符号化スキーム (CES)</a>
に対応する。
バイト文字列のテキスト処理は、それに利用されている特定の符号化法に依存する。
符号化法が変更された場合，その処理も，新たな符号化法の構造が反映されるように変更されなければならなくなる。
その種の変更は、そのバイト文字列がテキストとして処理される際に利用される関数や API に対し，大きな設計変更を要し得る。
したがって この定義は、文字列のテキストとしての特質が重要ではない，文字列が単にバイト長を伴う “不透明な” データ片と見なされるときにのみ（バッファを複製するときなど）、仕様において有用になる。
<span lang="en">
Byte string: A string viewed as a sequence of bytes representing characters in a particular character encoding. This corresponds to a character encoding scheme (CES). Text processing of a byte string is dependent on the particular encoding used. When the encoding changes the processing must also be changed to reflect the structure of the new encoding. Such a change could require significant redesign of the functions or API used to process the byte strings as text. Therefore, this definition is only useful in specifications when the textual nature of a string is unimportant and the string is considered only as a piece of opaque data with a length in bytes (such as when copying a buffer).
</span></p>

<div class="req"><h5 id="C011">C011<i>[仕様]</i></h5>
<p>
仕様は、文字列を
“<span class="qterm">バイト文字列</span>”
として定義する<em class="rfc2119">べき</em>でない。
<span lang="en">
C011 [S] Specifications SHOULD NOT define a string as a 'byte string'.
</span></p></div>

<div class="example"><p><b>例：</b>
次の例は、文字列をバイト文字列とみなすことが問題になり得る理由の一つを説明するものである：
ビッグエンディアン バイト順で UTF-16 に符号化された（ UTF-16BE ）文字<!-- 𣎴 --> U+233B4 （“切り株” を意味する，中国語の文字）を含むテキストを考える。
このテキストは，バイト列 D8 4C DF B4 を含むことになる。
このテキストがバイト文字列と見なされた場合、文字 U+4CDF <!-- 䳟 -->（ “不死鳥” を意味する別の中国語の文字）が検索される際に， U+4CDF の UTF-16BE 表現であるバイト列 4C DF に誤って合致することになる。
<span lang="en">
EXAMPLE: This is a counter-example, illustrating one reason why considering strings as byte strings may be problematic. Consider text containing the character U+233B4 (a Chinese character meaning 'stump of tree') encoded as UTF-16 in big-endian byte order (UTF-16BE). The text will contain the bytes D8 4C DF B4. If one searches this text, considered as a byte string, for the character U+4CDF (another Chinese character meaning 'phoenix'), an erroneous match will be found on the bytes 4C DF that are the UTF-16BE representation of U+4CDF.
</span></p></div>

<p id="def-physical-string">
<dfn id="code-unit-string">符号単位文字列</dfn>：
特定の文字符号化法の下で、文字を，それを表現する
<a href="#def-CEF">符号単位</a>
の並びとして捉えたときの文字列。
これは
<a href="#def-CEF">文字符号化形式 (CEF)</a>
に対応する。
符号単位文字列を定義するためには、符号単位のサイズ（例えば 16 ビット）, および
利用される文字符号化法（例えば UTF-16 ）の選定を要する。
符号単位文字列は、実装の候補にされている符号化形式について依拠できる知識に基づいた，文字列データの物理的表現を公開する API において有用になる。
例えば， DOM
<a href="#dom1">[DOM Level 1]</a>
では、広範な実装の実践に基づいて， UTF-16 が選定されている。
一般的に、“符号単位文字列” は， UTF-16 または UTF-32 が実装の候補にされている場合にのみ有用になる。
<span lang="en">
Code unit string: A string viewed as a sequence of code units representing characters in a particular character encoding. This corresponds to a character encoding form (CEF). A definition of a code unit string needs to include the size of the code units (e.g. 16 bits) and the character encoding used (e.g. UTF-16). Code unit strings are useful in APIs that expose a physical representation of string data based on reliable knowledge of the encoding forms that are likely candidates for implementation. Example: For the DOM [DOM Level 1], UTF-16 was chosen based on widespread implementation practice. In general, 'code unit string' is only useful if the implementation candidates are likely to be either UTF-16 or UTF-32.
</span></p>

<p id="def-character-string">
<dfn id="character-string">文字文字列</dfn>
：
それぞれが Unicode
<a href="#unicode">[Unicode]</a>
の
<a href="#def-CCS">符号位置</a>
で表現される，文字の並びとして捉えたときの文字列。
これは，大方の利用者が知覚する文字と正確には一致しないかもしれないが、プログラマたちが通常，文字列とみなしているものである。
これが、ごく少ない実装の労力で相互運用性を確保できるような，最も高次の抽象化層である。
“文字文字列” の定義による文字列が、一般的に最も有用になる。
この定義を利用する好例には
XML 1.0 <a href="#xml10">[XML 1.0]</a>
の生成規則 “[2]”,
HTML 4.0 <a href="#html401">[HTML 4.01]</a>
の SGML 宣言,
RFC 2070
<a href="#rfc2070">[RFC 2070]</a>
の文字モデルなどが挙げられる。
<span lang="en">
Character string: A string viewed as a sequence of characters, each represented by a code point in Unicode [Unicode]. This is usually what programmers consider to be a string, although it may not match exactly what most users perceive as characters. This is the highest layer of abstraction that ensures interoperability with very low implementation effort. The 'character string' definition of a string is generally the most useful. Good examples using this definition include the Production [2] of XML 1.0 [XML 1.0], the SGML declaration of HTML 4.0 [HTML 4.01], and the character model of RFC 2070 [RFC 2070].
</span></p>

<p class="trans-note">【
文字文字列<!--0-->（ character string ） — 訳語としては不自然な重複になってしまうが、この節の冒頭で注釈した様に， “文字列” という語自体が訳語として定着しているため、致し方ない所でもある。
】</p>

<div class="req"><h5 id="C012">C012<i>[仕様]</i></h5>
<p>
大方の仕様には
“<span class="qterm">文字文字列</span>”
の定義が利用される<em class="rfc2119">べき</em>である。
<span lang="en">
C012 [S] The 'character string' definition SHOULD be used by most specifications.
</span></p></div>

<div class="example"><p><b>例：</b>
次の表の各行に、文字［
U+233B4 （“切り株” を意味する，中国語の文字）,
U+2260 <span class="uname">NOT EQUAL TO</span>,
U+0071 <span class="uname">LATIN SMALL LETTER Q</span>,
U+030C <span class="uname">COMBINING CARON</span>
］からなる，ビッグエンディアン バイト順で UTF-16 に符号化された文字列に対し，
<a href="#def-character-string">文字文字列</a>,
<a href="#def-physical-string">符号単位文字列</a>,
<a href="#def-byte-string">バイト文字列</a>
で捉えた様子を順に示す：
<span lang="en">
EXAMPLE: Consider the string comprising the characters U+233B4 (a Chinese character meaning 'stump of tree'), U+2260 NOT EQUAL TO, U+0071 LATIN SMALL LETTER Q and U+030C COMBINING CARON, encoded in UTF-16 in big-endian byte order. The rows of the following table show the string viewed as a character string, code unit string and byte string, respectively:
</span></p>

<table>
<caption>
文字並び／符号単位 列／バイト列としての，文字列
<span lang="en">
table displaying a string viewed as characters, code units and bytes
</span></caption>
<tbody>
<tr><th><a href="#stumpOfTree">グリフ（画像）</a><td colspan="4">
<img
	src="charmod/chineseSurrogate.gif"
	height="25" width="24"
	alt=
"表意文字 補助文字： “切り株” を意味する古代中国文字（現代でも広東語で利用されている）。
Ideographic supplementary character: Archaic Chinese character meaning &quot;the stump of a tree&quot; (still in current use in Cantonese)"
>
<td colspan="2">
<img src="charmod/not_equal.gif" alt="NOT EQUAL TO" height="26" width="25">
<td colspan="2">
<img src="charmod/Q.gif" alt="LATIN SMALL LETTER Q" height="21" width="14">
<td colspan="2">
<img src="charmod/caron.gif" alt="COMBINING CARON" height="21" width="14">

<tr><th class="trans-note">【グリフ（テキスト）】<td colspan="4">&#x233B4;<td colspan="2">&#x2260;<td colspan="2">&#x0071;<td colspan="2">&#x030C;
<tr><th>文字文字列<td colspan="4">U+233B4<td colspan="2">U+2260<td colspan="2">U+0071<td colspan="2">U+030C
<tr><th>符号単位文字列<td colspan="2">D84C<td colspan="2">DFB4<td colspan="2">2260<td colspan="2">0071<td colspan="2">030C
<tr><th>バイト文字列<td>D8<td>4C<td>DF<td>B4<td>22<td>60<td>00<td>71<td>03<td>0C

</tbody></table></div>

<p class="note"><b>注記：</b>
文字列は、
<dfn id="grapheme-clusters">書記素クラスタ</dfn>
の並びとして見ることもできる。
書記素クラスタは、
<a href="#def-character-string">文字文字列</a>
に比して，<!-- 視覚的に -->描画されたテキストにおいて 利用者から知覚される文字の境界に より近似する単位に，テキストを分離する。
書記素クラスタについての論は Unicode 標準, バージョン 4
<a href="#unicode40">[Unicode  4.0]</a>
の 2.10 節の末尾に与えられている。
公式的な定義は Unicode 標準 Annex #29
<a href="#UTR29">[UTR #29]</a>
にて与えられる。
Unicode 標準は <em>既定の</em> 書記素クラスタを定義する。
一部の言語は、この既定をより適切な形に<em >あつらえる</em>ことを要する。
例えば，スロヴァキア語の利用者からは、書記素クラスタの既定のペア
<span class="qchar">ch</span>
が単独の書記素クラスタとして扱われることが望まれるであろう。
文字列内容の言語と末端利用者の選好との間の相互作用は複雑なものになり得ることに注意。
<span lang="en">
NOTE: It is also possible to view a string as a sequence of grapheme clusters. Grapheme clusters divide the text into units that correspond more closely than character strings to the user's perception of where character boundaries occur in a visually rendered text. A discussion of grapheme clusters is given at the end of Section 2.10 of the Unicode Standard, Version 4 [Unicode 4.0]; a formal definition is given in Unicode Standard Annex #29 [UTR #29]. The Unicode Standard defines default grapheme clustering. Some languages require tailoring to this default. For example, a Slovak user might wish to treat the default pair of grapheme clusters "ch" as a single grapheme cluster. Note that the interaction between the language of string content and the end-user's preferences may be complex.
</span></p>

	</section>
	<section id="sec-stringIndexing">

<h3 title="String indexing">6.2. 文字列の付番</h3>

<p>
ソフトウェアの処理が 部分文字列, あるいは文字列の中の一点へのアクセスを必要とし，
<dfn id="indices">index</dfn>
—
すなわち，数値による文字列の中の “位置”
—
を利用してそれを行う状況は頻繁にある。
その種の index がウェブ上のコンポーネント間でやりとりされても，ふるまいの一貫性が保たれるためには、文字列の付番（文字列の中の各位置への番号の振り方）について予め合意される必要がある。
文字列の付番のための要件は
“文字列の同一性合致検出のための要件”
—
<cite>Requirements for String Identity Matching</cite>
<a href="#CharReq">[CharReq]</a>,
<a href="http://www.w3.org/TR/WD-charreq#4">4 節</a>
にて論じられる。
主要な問いは２つある：
(1) “どのような単位に基づいて数えるか？”
(2) “0 か 1 どちらを起点にするか？”
<span lang="en">
There are many situations where a software process needs to access a substring or to point within a string and does so by the use of indices, i.e. numeric "positions" within a string. Where such indices are exchanged between components of the Web, there is a need for an agreed-upon definition of string indexing in order to ensure consistent behavior. The requirements for string indexing are discussed in Requirements for String Identity Matching [CharReq], section 4. The two main questions that arise are: "What is the unit of counting?" and "Do we start counting at 0 or 1?".
</span></p>

<p>
前の 6.1 節：<a href="#sec-Strings"><b>文字列の概念</b></a>
では、文字列が
<a href="#def-character-string">文字文字列</a>,
<a href="#def-physical-string">符号単位文字列</a>,
<a href="#def-byte-string">バイト文字列</a>
として捉えられることを示した。
そのそれぞれが異なる単位に基づく付番を孕んでいる。
<span lang="en">
The example in the previous section, 6.1 String concepts, shows a string viewed as a character string, code unit string and byte string, respectively, each of which involves different units for indexing.
</span></p>

<p>
処理の特定の要件に依存して、数え方の単位は
6.1 節：<a href="#sec-Strings"><b>文字列の概念</b></a>
にて与えられた，文字列の種々の定義に対応し得る。
特に：
<span lang="en">
Depending on the particular requirements of a process, the unit of counting may correspond to definitions of a string provided in section 6.1 String concepts. In particular:
</span></p>

<ul>
	<li>
<div class="req"><h5 id="C051">C051<i>[仕様]</i><i>[実装]</i></h5>
<p>
文字列の付番は、
<a href="#def-character-string">文字文字列</a>
に基づくものが<em class="rfc2119">推奨される</em>。
<span lang="en">
C051 [S] [I] The character string is RECOMMENDED as a basis for string indexing.
</span></p></div>

<p>
（例： XML Path 言語
<a href="#xpath">[XPath]</a>
）。
<span lang="en">
(Example: the XML Path Language [XPath]).
</span></p>

	</li>
	<li>
<div class="req"><h5 id="C052">C052<i>[仕様]</i><i>[実装]</i></h5>
<p>
内部演算の効率性が
<a href="#def-character-string">文字文字列</a>
に基づく付番に比して大幅に向上する場合は、
<a href="#def-physical-string">符号単位文字列</a>
に基づく文字列の付番が利用されても<em class="rfc2119">よい</em>。
<span lang="en">
C052 [S] [I] A code unit string MAY be used as a basis for string indexing if this results in a significant improvement in the efficiency of internal operations when compared to the use of character string.
</span></p></div>

<p>
（例：
<a href="#dom1">[DOM Level 1]</a>
における UTF-16 の利用）。
<span lang="en">
(Example: the use of UTF-16 in [DOM Level 1]).
</span></p>
	</li>
	<li>

<div class="req"><h5 id="C071">C071<i>[仕様]</i><i>[実装]</i></h5>
<p>
利用者との対話が主要な部分を占めるアプリケーションにおいては、
<a href="#grapheme-clusters">書記素クラスタ</a>
に基づく文字列の付番が利用されても<em class="rfc2119">よい</em>。
<span lang="en">
C071 [S] [I] Grapheme clusters MAY be used as a basis for string indexing in applications where user interaction is the primary concern.
</span></p></div>

<p>
Unicode 標準 Annex #29, Text Boundaries
<a href="#UTR29">[UTR #29]</a>
を参照。
<span lang="en">
See Unicode Standard Annex #29, Text Boundaries [UTR #29].
</span></p>

<div class="req"><h5 id="C074">C074<i>[仕様]</i></h5>
<p>
書記素クラスタを通して付番を定義する仕様は、次のいずれかにより定義しなければ<em class="rfc2119">ならない</em>：
a)
Unicode 標準 Annex #29, Text Boundaries
<a href="#UTR29">[UTR #29]</a>
にて定義される既定の書記素クラスタを通して，書記素クラスタを定義する,
または
b)
付番演算を，より適切な形に<em >あつらえる</em>方法を特に詳細に規定する。
<span lang="en">
C074 [S] Specifications that define indexing in terms of grapheme clusters MUST either: a) define grapheme clusters in terms of default grapheme clusters as defined in Unicode Standard Annex #29, Text Boundaries [UTR #29], or b) define specifically how tailoring is applied to the indexing operation.
</span></p></div>

	</li>
	<li>
<div class="req"><h5 id="C072">C072<i>[仕様]</i><i>[実装]</i></h5>
<p>
<a href="#def-byte-string">バイト文字列</a>
に基づく付番は<em class="rfc2119">推奨されない</em>。
<span lang="en">
C072 [S] [I] The use of byte strings for indexing is NOT RECOMMENDED.
</span></p></div></li>
</ul>

<hr>

<p>
数値を通さずに部分文字列を識別しつつ，都合の良い特性を備えるような、特記すべき方法もある。
例えば、文字列合致検出に基づく部分文字列は，小さな編集に対し十全に堅牢であり、文書構造に基づく部分文字列（ XML のような構造的なデータ形式）は，編集に対し, あるいは別の自然言語への翻訳においてすらも，より堅牢である。
<span lang="en">
It is noteworthy that there exist other, non-numeric ways of identifying substrings which have favorable properties. For instance, substrings based on string matching are quite robust against small edits; substrings based on document structure (in structured formats such as XML) are even more robust against edits and even against translation of a document from one human language to another.
</span></p>

<div class="req"><h5 id="C053">C053<i>[仕様]</i></h5>
<p>
部分文字列や 文字列内の一点を識別する方法を必要とする仕様は、この演算を遂行するための，文字列の付番 以外の方法も供する<em class="rfc2119">べき</em>である。
<span lang="en">
C053 [S] Specifications that need a way to identify substrings or point within a string SHOULD provide ways other than string indexing to perform this operation.
</span></p></div>

<div class="req"><h5 id="C054">C054<i>[実装]</i><i>[内容]</i></h5>
<p>
仕様の利用者（ソフトウェア開発者, 内容開発者）は、可能な所では，部分文字列または文字列内の一点を識別するための，文字列の付番 以外の方法を選ぶ<em class="rfc2119">べき</em>である。
<span lang="en">
C054 [I] [C] Users of specifications (software developers, content developers) SHOULD whenever possible prefer ways other than string indexing to identify substrings or point within a string.
</span></p></div>

<p>
経験から、個々の文字は，部分文字列の前／後の位置から識別される部分文字列として解された上で処理された方が、より 汎用, 柔軟, 堅牢 な仕様になることが判っている。
index を数え方の単位の<em>狭間</em>の位置と解することで、異なる文字列定義による index にも関連付け易くなる。
<span lang="en">
Experience shows that more general, flexible and robust specifications result when individual characters are understood and processed as substrings, identified by a position before and a position after the substring. Understanding indices as boundary positions between the counting units also makes it easier to relate the indices resulting from the different string definitions.
</span></p>

<div class="req"><h5 id="C055">C055<i>[仕様]</i></h5>
<p>
仕様は、１個の文字を部分文字列として解した上で処理し，選定されている数え方の単位に関わらず， index をその単位の<em>狭間</em>の位置として扱う<em class="rfc2119">べき</em>である。
<span lang="en">
C055 [S] Specifications SHOULD understand and process single characters as substrings, and treat indices as boundary positions between counting units, regardless of the choice of counting units.
</span></p></div>

<div class="req"><h5 id="C056">C056<i>[仕様]</i></h5>
<p>
API の仕様は、１個の文字や１個の
“<span class="qterm">符号化単位</span>”
を，引数や返り値の型に指定する<em class="rfc2119">べき</em>でない。
<span lang="en">
C056 [S] Specifications of APIs SHOULD NOT specify single characters or single 'units of encoding' as argument or return types.
</span></p></div>

<p class="example"><b>例：</b>
仮に “大文字化” 関数 <code>uppercase</code> の返り値型が１個の文字として定義された場合、 <code>uppercase("ß")</code> は，正しい結果（長さ２の文字文字列
<span class="qchar">SS</span>
）を返せなくなる。
<span class="trans-note">【
現在の Unicode には "ß" の大文字 （ "ẞ" ）もあるので，この例は文脈によっては適切でないかもしれない。
】</span>
また，
3 節：<a href="#sec-Perceptions"><b>文字の知覚</b></a>
にて述べたように、文字と，声音／入力 の単位, 等々の間に一対一の対応関係が成り立つとは限らないことにも注意。
<span lang="en">
EXAMPLE: The function uppercase("ß") cannot return the proper result (the two-character string 'SS') if the return type of the uppercase function is defined to be a single character. Note, also, that there is not necessarily a one-to-one mapping between characters and units of sound, input, etc. as described in 3 Perceptions of Characters.
</span></p>

<p>
index の起点，すなわち 0, 1 のいずれから数えるかについて
—
この問題は、実際には，単位それ自身を数えるか 単位の狭間の位置を数えるかの裁定が下された後にのみ，発生する。
<span lang="en">
The issue of index origin, i.e. whether we count from 0 or 1, actually arises only after a decision has been made on whether it is the units themselves that are counted or the positions between the units.
</span></p>

<div class="req"><h5 id="C057">C057<i>[仕様]</i></h5>
<p>
文字列の付番において単位の狭間の位置が数えられる場合、文字列の始端を index 0 と定めることが<em class="rfc2119">推奨される</em>。
その場合、最後の index は，文字列に含まれる単位の個数になる。
<span lang="en">
C057 [S] When the positions between the units are counted for string indexing, starting with an index of 0 for the position at the start of the string is the RECOMMENDED solution, with the last index then being equal to the number of counting units in the string.
</span></p></div>

	</section>
</section>
<section id="sec-RefUnicode">

<h2 title="Referencing the Unicode Standard and ISO/IEC 10646">7. Unicode 標準や ISO/IEC 10646 を参照するとき</h2>

<p>
仕様はしばしば， Unicode 標準や国際標準 ISO/IEC 10646 を参照する必要が生じる。
特に，規範として参照するときは、注意深く行われなければならない。
考慮されるべき問いは：
<span lang="en">
Specifications often need to make references to the Unicode Standard or International Standard ISO/IEC 10646. Such references must be made with care, especially when normative. The questions to be considered are:
</span></p>

<ul>
	<li>
どの標準が参照されるべきか？
<span lang="en">
Which standard should be referenced?
</span></li>
	<li>
特定のバージョンはどのように参照するのか？
<span lang="en">
How to reference a particular version?
</span></li>
	<li>
どのようなときに，バージョン付きの参照と そうでない参照を使い分けるのか？
<span lang="en">
When to use versioned vs. unversioned references?
</span></li>
</ul>

<hr>

<p>
ISO/IEC 10646 は
<abbr title="International Organization for Standardization"><a href="http://www.iso.org/iso/en/ISOOnline.openerpage">ISO</a></abbr>
（国際標準化機構）と
<abbr title="International Electrotechnical Commission"><a href="http://www.iec.ch/">IEC</a></abbr>
（国際電気標準会議）の協同により，開発され, 発行された。
Unicode 標準は
<a href="https://www.unicode.org/">Unicode Consortium</a>
（主要な コンピュータ企業, ソフトウェア製作者, データベースベンダ, 国, 研究機関, 国際的機関, 種々の利用者グループや関心を持つ個人，からなる組織）により開発され，発行された。
Unicode 標準は、 W3C 勧告に相当する位置付けのものである。
<span lang="en">
ISO/IEC 10646 is developed and published jointly by ISO (the International Organization for Standardization) and IEC (the International Electrotechnical Commission). The Unicode Standard is developed and published by the Unicode Consortium, an organization of major computer corporations, software producers, database vendors, national governments, research institutions, international agencies, various user groups, and interested individuals. The Unicode Standard is comparable in standing to W3C Recommendations.
</span></p>

<p>
ISO/IEC 10646 と Unicode 標準は、きっかり同じ
<a href="#def-CCS">符号化文字集合 (CCS)</a>
（同じ
<a href="#def-repertoire">レパートリ</a>,
同じ
<a href="#def-CCS">符号位置</a>
）と符号化形式を定義する。
それらは、それぞれの技術委員会の間の情報交換やメンバの重なり合いを通して，同調的に, かつ活動的に保守されている。
共同で定義された CCS と符号化形式に加えて， Unicode 標準では［
文字 特性の規範的な（および参考情報の）一覧,
文字の 等価性／正規化 のための規範的な仕様,
双方向テキストのための規範的なアルゴリズム,
実装に有用な数多の情報
］も追加している。
要約すると、 Unicode 標準は ISO/IEC 10646 が単に列挙している文字に意味論を追加している。
Unicode 標準への適合は ISO/IEC 10646 への適合も意味する。
<a href="#unicode40">[Unicode  4.0]</a>
Appendix C を参照。
<span lang="en">
ISO/IEC 10646 and the Unicode Standard define exactly the same coded character set (CCS) (same repertoire, same code points) and encoding forms. They are actively maintained in synchrony by liaisons and overlapping membership between the respective technical committees. In addition to the jointly defined CCS and encoding forms, the Unicode Standard adds normative and informative lists of character properties, normative character equivalence and normalization specifications, a normative algorithm for bidirectional text and a large amount of useful implementation information. In short, the Unicode Standard adds semantics to the characters that ISO/IEC 10646 merely enumerates. Conformance to the Unicode Standard implies conformance to ISO/IEC 10646, see [Unicode 4.0] Appendix C.
</span></p>

<div class="req"><h5 id="C062">C062<i>[仕様]</i></h5>
<p>
一般に、仕様は，利用する文字とそれに結び付けられている意味論を定義する必要があるので、
ISO/IEC 10646 への参照を含めるかどうかに関わらず、 Unicode 標準への参照を含める<em class="rfc2119">べき</em>である。
<span lang="en">
C062 [S] Since specifications in general need both a definition for their characters and the semantics associated with these characters, specifications SHOULD include a reference to the Unicode Standard, whether or not they include a reference to ISO/IEC 10646.
</span></p></div>

<p>
Unicode 標準への参照を供することにより、実装者は，その標準, および Unicode Consortium ウェブサイトが提供する豊富な情報による恩恵を得られる。
<span lang="en">
By providing a reference to the Unicode Standard implementers can benefit from the wealth of information provided in the standard and on the Unicode Consortium Web site.
</span></p>

<p>
ISO/IEC 10646 と Unicode 標準は（同調的に）発展し続けているので、バージョン付けの課題も生じる：
仕様は、標準の特定のバージョンを参照すべきか？
それとも，規範的な参照が仕様が<em>読まれている</em>時点のバージョンを指すようにするために，総称的な参照にすべきか？
—
一般に、その答えは<em>両者</em>である。
<span lang="en">
The fact that both ISO/IEC 10646 and the Unicode Standard are evolving (in synchrony) raises the issue of versioning: should a specification refer to a specific version of the standard, or should it make a generic reference, so that the normative reference is to the version current at the time of reading the specification? In general the answer is both.
</span></p>

<div class="req"><h5 id="C063">C063<i>[仕様]</i></h5>
<p>
仕様の発行後に割り当てられた文字が，その仕様で利用可能になることが望まれる場合には、
Unicode 標準への総称的参照が含められなければ<em class="rfc2119">ならない</em>。
特定のバージョンに依存する機能がいつでも利用可能, かつ時の経過に伴って変更されないことを確保するために、 Unicode 標準への特定の参照が含められても<em class="rfc2119">よい</em>。
<span lang="en">
C063 [S] A generic reference to the Unicode Standard MUST be made if it is desired that characters allocated after a specification is published are usable with that specification. A specific reference to the Unicode Standard MAY be included to ensure that functionality depending on a particular version is available and will not change over time.
</span></p></div>

<p>
例えば、 XML 1.0
<a href="#xml10">[XML 1.0]</a>
において， Name 文字として受容され得る文字の集合が挙げられる — これは、構文解析器が名前の妥当性を検証するために実装しなければならない，列挙一覧である。
<span lang="en">
An example would be the set of characters acceptable as Name characters in XML 1.0 [XML 1.0], which is an enumerated list that parsers must implement to validate names.
</span></p>

<p class="note"><b>注記：</b>
Unicode 標準の特定のバージョンを参照するための指針については
<a href="https://www.unicode.org/unicode/standard/versions/#Citations">https://www.unicode.org/unicode/standard/versions/#Citations</a>
を参照。
<span lang="en">
NOTE: See http://www.unicode.org/unicode/standard/versions/#Citations for guidance on referring to specific versions of the Unicode Standard.
</span></p>

<p>
総称的参照の公式的な方法には，次の２つがある：
<span lang="en">
A generic reference can be formulated in two ways:
</span></p>

<ol>
	<li>
仕様の参照文献の節に “<em>総称的</em>” 項目を明示的に含ませた上で、単に仕様の本文から，その項目を参照する。
その種の総称的項目には
“…これは時の経過に伴い改訂または修正が加えられ得る”
のような類いのテキストを含ませる。
<span lang="en">
By explicitly including a generic entry in the bibliography section of a specification and simply referring to that entry in the body of the specification. Such a generic entry contains text such as "... as it may from time to time be revised or amended".
</span></li>
	<li>
参照文献に “<em>特定的</em>” 項目を含ませた下では、仕様の本文の中で参照を与える所に，
“…これは時の経過に伴い改訂または修正が加えられ得るので…”
のような類いのテキストを含ませる。
<span lang="en">
By including a specific entry in the bibliography and adding text such as "... as it may from time to time be revised or amended" at the point of reference in the body of the specification.
</span></li>
</ol>

<hr>

<p>
これら２つの公式化のいずれが利用されるかは、それぞれの仕様の編集の裁量に委ねられる。
最初の公式化の例は、この仕様の参照文献にて見られる（
<a href="#iso10646">[ISO/IEC 10646]</a>
と
<a href="#unicode">[Unicode]</a>
の項目を見よ）。
後者の例は、
<a href="#rfc3629">[RFC 3629]</a>
と
<a href="#rfc2781">[RFC 2781]</a>
にて， UCS 符号化法のための
MIME <code class="keyword">charset</code>
パラメタに対するバージョン付けについての論も伴われた形で見られる。
<span lang="en">
It is an editorial matter, best left to each specification, which of these two formulations is used. Examples of the first formulation can be found in the bibliography of this specification (see the entries for [ISO/IEC 10646] and [Unicode]). Examples of the latter, as well as a discussion of the versioning issue with respect to MIME charset parameters for UCS encodings, can be found in [RFC 3629] and [RFC 2781].
</span></p>

<div class="req"><h5 id="C064">C064<i>[仕様]</i></h5>
<p>
Unicode 標準
<a href="#unicode">[Unicode]</a>
へのすべての<em>総称的</em>参照は、仕様の発行日に利用可能な Unicode 標準の最新のバージョンを参照しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C064 [S] All generic references to the Unicode Standard [Unicode] MUST refer to the latest version of the Unicode Standard available at the date of publication of the containing specification.
</span></p></div>

<div class="req"><h5 id="C065">C065<i>[仕様]</i></h5>
<p>
ISO/IEC 10646
<a href="#iso10646">[ISO/IEC 10646]</a>
へのすべての<em>総称的</em>参照は、仕様の発行日に利用可能な ISO/IEC 10646 の最新のバージョンを参照しなければ<em class="rfc2119">ならない</em>。
<span lang="en">
C065 [S] All generic references to ISO/IEC 10646 [ISO/IEC 10646] MUST refer to the latest version of ISO/IEC 10646 available at the date of publication of the containing specification.
</span></p></div>

</section>


<section id="sec-References">

<h2 title="References">A. 参照文献<!--1--></h2>

	<section id="sec-NormativeReferences">

<h3 title="Normative References">A.1. 参照文献<!--1-->（規範的）</h3>

<dl>
	
	<dt id="iana">[IANA]</dt>
	<dd>
Internet Assigned Numbers Authority,
<a href="http://www.iana.org/assignments/character-sets"><cite>Official Names
for Character Sets</cite></a>. (See 
<a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>.)
</dd>
	<dt id="iso10646">[ISO/IEC 10646]</dt>
	<dd>
ISO/IEC 10646:2003,
<a href="http://www.iso.ch/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=39921"><cite>Information
technology -- Universal Multiple-Octet Coded Character Set (UCS)</cite></a>,
as, from time to time, amended, replaced by a
new edition or expanded by the addition of new parts. (See 
<a href="http://www.iso.org/iso/en/ISOOnline.openerpage">http://www.iso.org/iso/en/ISOOnline.openerpage</a> for the
latest version.)
</dd>
	<dt id="MIME-entity">[MIME-entity]</dt>
	<dd>
N. Freed, N. Borenstein,
<a href="http://www.ietf.org/rfc/rfc2045.txt"><cite>Multipurpose Internet Mail
Extensions (MIME). Part One: Format of Internet Message Bodies</cite></a>,
RFC 2045, November 1996, 
<a href="http://www.ietf.org/rfc/rfc2045.txt">http://www.ietf.org/rfc/rfc2045.txt</a>.
</dd>

	<dt id="MIME-charset">[MIME-charset]</dt>
	<dd>
<a href="http://www.ietf.org/rfc/rfc2046.txt"><cite>Multipurpose Internet Mail
Extensions (MIME). Part Two: Media Types</cite></a>,
N. Freed, N. Borenstein, RFC 2046,
November 1996, <a href="http://www.ietf.org/rfc/rfc2046.txt">http://www.ietf.org/rfc/rfc2046.txt</a>.
</dd>
	<dt id="rfc2119">[RFC 2119]</dt>
	<dd>
S. Bradner,
<a href="http://www.ietf.org/rfc/rfc2119.txt"><cite>Key words for use in RFCs
to Indicate Requirement Levels</cite></a>,
IETF RFC 2119. (See 
<a href="http://www.ietf.org/rfc/rfc2119.txt">http://www.ietf.org/rfc/rfc2119.txt</a>.)
</dd>
	<dt id="unicode">[Unicode]</dt>
	<dd>
The Unicode Consortium,
<a href="http://www.unicode.org/unicode/standard/versions/"><cite>The Unicode Standard, Version 4</cite></a>,
ISBN 0-321-18578-1, as
updated from time to time by the publication of new versions. (See 
<a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions</a>
for the latest version and additional information on versions of the standard
and of the Unicode Character Database).
</dd>
	<dt id="unicode32">[Unicode 3.2]</dt>
	<dd>
The Unicode Consortium,
<a href="http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_3_2_0"><cite>The Unicode Standard, Version 3.2.0</cite></a> is defined by
<a href="http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_3_0_0"><cite>The Unicode Standard, Version 3.0</cite></a> (Reading, MA,
Addison-Wesley, 2000. ISBN 0-201-61633-5), as amended by the <a href="http://www.unicode.org/reports/tr27/"><cite>Unicode
Standard Annex #27: Unicode 3.1</cite></a> (see 
<a href="http://www.unicode.org/reports/tr27/">http://www.unicode.org/reports/tr27</a>)
and by the <a href="http://www.unicode.org/reports/tr28/"><cite>Unicode Standard Annex #28: Unicode 3.2</cite></a> (see 
<a href="http://www.unicode.org/reports/tr28/">http://www.unicode.org/reports/tr28</a>).
</dd>
	<dt id="unicode40">[Unicode 4.0]</dt>
	<dd>
The Unicode Consortium.
<a href="http://www.unicode.org/versions/Unicode4.0.0/"><cite>The Unicode Standard, Version 4.0</cite></a>,
Reading, MA, Addison-Wesley, 2003. ISBN 0-321-18578-1. (See <a href="http://www.unicode.org/versions/Unicode4.0.0/">http://www.unicode.org/versions/Unicode4.0.0/</a>)
</dd></dl>

	</section>
	<section id="sec-OtherReferences">

<h3 title="Other References">A.2. 他の参照文献<!--1--></h3>

<dl>
	<dt id="charnorm">[CharNorm]</dt>
	<dd>
Martin J. Dürst,
François Yergeau, Richard Ishida, Misha Wolf, Tex Texin, Addison Phillips
<a href="http://www.w3.org/TR/charmod-norm/"><cite>Character Model for the World Wide Web 1.0: Normalization</cite></a>,
W3C Working Draft. (See 
<a href="http://www.w3.org/TR/charmod-norm/">http://www.w3.org/TR/charmod-norm</a>.)
</dd>
	<dt id="charmod3">[CharIRI]</dt>
	<dd>
Martin J. Dürst, François Yergeau, Richard Ishida, Misha Wolf, Tex Texin,
<a href="http://www.w3.org/TR/charmod-resid/"><cite>Character Model for the World Wide Web 1.0: Resource Identifiers</cite></a>,
W3C Candidate Recommendation. (See
<a href="http://www.w3.org/TR/charmod-resid/">http://www.w3.org/TR/charmod-resid</a>.)
</dd>
	<dt id="CharReq">[CharReq]</dt>
	<dd>
Martin J. Dürst,
<a href="http://www.w3.org/TR/WD-charreq"><cite>Requirements for String
Identity Matching and String Indexing</cite></a>,
W3C Working Draft. (See 
<a href="http://www.w3.org/TR/WD-charreq">http://www.w3.org/TR/WD-charreq</a>.)
</dd>
	<dt id="connolly">[Connolly]</dt>
	<dd>
D. Connolly,
<a href="http://www.w3.org/MarkUp/html-spec/charset-harmful"><cite>Character
Set Considered Harmful</cite></a>,
W3C Note. (See 
<a href="http://www.w3.org/MarkUp/html-spec/charset-harmful">http://www.w3.org/MarkUp/html-spec/charset-harmful</a>.)
</dd>
	<dt id="css2">[CSS21]</dt>
	<dd>
Bert Bos, Håkon Wium Lie, Chris Lilley,
Ian Jacobs, Eds.,
<a href="http://www.w3.org/TR/REC-CSS2/"><cite>Cascading
Style Sheets, level 2</cite></a> (CSS2 Specification), W3C Recommendation. (See
<a href="http://www.w3.org/TR/REC-CSS2/">http://www.w3.org/TR/REC-CSS2</a>.)
</dd>
	<dt id="dom1">[DOM Level 1]</dt>
	<dd>
Vidur Apparao et al.,
<a href="http://www.w3.org/TR/REC-DOM-Level-1/"><cite>Document Object Model
(DOM) Level 1 Specification</cite></a>,
W3C Recommendation. (See
<a href="http://www.w3.org/TR/REC-DOM-Level-1/">http://www.w3.org/TR/REC-DOM-Level-1</a>.)
</dd>
	<dt id="html40">[HTML 4.0]</dt>
	<dd>
Dave Raggett, Arnaud Le Hors, Ian
Jacobs, Eds.,
<a href="http://www.w3.org/TR/REC-html40-971218/"><cite>HTML 4.0
Specification</cite></a>,
W3C Recommendation, 18-Dec-1997 (See
<a href="http://www.w3.org/TR/REC-html40-971218/">http://www.w3.org/TR/REC-html40-971218</a>.)
</dd>
	<dt id="html401">[HTML 4.01]</dt>
	<dd>
Dave Raggett, Arnaud Le Hors, Ian
Jacobs, Eds.,
<a href="http://www.w3.org/TR/html401/"><cite>HTML 4.01
Specification</cite></a>,
W3C Recommendation. (See
<a href="http://www.w3.org/TR/html401/">http://www.w3.org/TR/html401</a>.)
</dd>
	<dt id="iso646">[ISO/IEC 646]</dt>
	<dd>
ISO/IEC 646:1991, <cite>Information technology -- ISO 7-bit coded character set for information interchange</cite>.  This standard defines an International Reference Version (IRV) which corresponds exactly to what is widely known as ASCII or US-ASCII.  ISO/IEC 646 was based on the earlier standard ECMA-6. ECMA has maintained its standard up to date with respect to ISO/IEC 646 and makes an electronic copy available at
<a href="http://www.ecma-international.org/publications/standards/Ecma-006.htm">http://www.ecma-international.org/publications/standards/Ecma-006.htm</a>
</dd>
	<dt id="iso9541">[ISO/IEC 9541-1]</dt>
	<dd>
ISO/IEC 9541-1:1991,
<a href="http://www.iso.ch/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=17277"><cite>Information
technology -- Font information interchange -- Part 1: Architecture</cite></a>. (See 
<a href="http://www.iso.ch/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=17277">http://www.iso.ch/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=17277</a>
for the latest version.)
</dd>
	<dt id="iso14651">[ISO/IEC 14651]</dt>
	<dd>
ISO/IEC 14651:2000,
<a href="http://www.iso.org/iso/en/ISOOnline.openerpage"><cite>Information technology --
International string ordering and comparison -- Method for comparing character
strings and description of the common template tailorable ordering</cite></a> as,
from time to time, amended, replaced by a new edition or expanded by the
addition of new parts. (See 
<a href="http://www.iso.org/iso/en/ISOOnline.openerpage">http://www.iso.org/iso/en/ISOOnline.openerpage</a> for the
latest version.)
</dd>
	<dt id="mathml2">[MathML2]</dt>
	<dd>
David Carlisle, Patrick Ion, Robert
Miner, Nico Poppelier, Eds.,
<a href="http://www.w3.org/TR/MathML2/"><cite>Mathematical Markup Language (MathML)
Version 2.0</cite></a>,
W3C Recommendation. (See 
<a href="http://www.w3.org/TR/MathML2/">http://www.w3.org/TR/MathML2</a>.)
</dd>
	<dt id="Nicol">[Nicol]</dt>
	<dd>
Gavin Nicol,
<a href="http://www.mind-to-mind.com/library/papers/multilingual/multilingual-www.html"><cite>The
Multilingual World Wide Web</cite></a>,
Chapter 2: The WWW As A Multilingual
Application. (See 
<a href="http://www.mind-to-mind.com/library/papers/multilingual/multilingual-www.html">http://www.mind-to-mind.com/library/papers/multilingual/multilingual-www.html</a>.)
</dd>
	<dt id="rfc2070">[RFC 2070]</dt>
	<dd>
F. Yergeau, G. Nicol, G. Adams, M.
Dürst,
<a href="http://www.ietf.org/rfc/rfc2070.txt"><cite>Internationalization of the
Hypertext Markup Language</cite></a>,
IETF RFC 2070, January 1997. (See 
<a href="http://www.ietf.org/rfc/rfc2070.txt">http://www.ietf.org/rfc/rfc2070.txt</a>.)
</dd>
	<dt id="rfc2277">[RFC 2277]</dt>
	<dd>
H. Alvestrand,
<a href="http://www.ietf.org/rfc/rfc2277.txt"><cite>IETF Policy on Character
Sets and Languages</cite></a>,
IETF RFC 2277, BCP 18, January 1998. (See 
<a href="http://www.ietf.org/rfc/rfc2277.txt">http://www.ietf.org/rfc/rfc2277.txt</a>.)
</dd>
	<dt id="RFC2978">[RFC 2978]</dt>
	<dd>
N. Freed, J. Postel,
<a href="http://www.ietf.org/rfc/rfc2978.txt"><cite>IANA Charset Registration Procedures</cite></a>,
IETF RFC 2978, BCP 19, October 2000. (See 
<a href="http://www.ietf.org/rfc/rfc2978.txt">http://www.ietf.org/rfc/rfc2978.txt</a>.)
</dd>
	<dt id="rfc3629">[RFC 3629]</dt>
	<dd>
F. Yergeau,
<a href="http://www.ietf.org/rfc/rfc3629.txt"><cite>UTF-8, a transformation
format of ISO 10646</cite></a>,
IETF RFC 3629, STD 63, November 2003. (See 
<a href="http://www.ietf.org/rfc/rfc3629.txt">http://www.ietf.org/rfc/rfc3629.txt</a>.)
</dd>
	<dt id="rfc2781">[RFC 2781]</dt>
	<dd>
P. Hoffman, F. Yergeau,
<a href="http://www.ietf.org/rfc/rfc2781.txt"><cite>UTF-16, an encoding of ISO
10646</cite></a>,
IETF RFC 2781, February 2000. (See 
<a href="http://www.ietf.org/rfc/rfc2781.txt">http://www.ietf.org/rfc/rfc2781.txt</a>.)
</dd>
	<dt id="spread">[SPREAD]</dt>
	<dd>
<a href="http://xml.ascc.net/resource/entities/index.html"><cite>SPREAD -
Standardization Project for East Asian Documents Universal Public Entity
Set</cite></a>. (See 
<a href="http://xml.ascc.net/resource/entities/index.html">http://www.ascc.net/xml/resource/entities/index.html</a>)
</dd>
	<dt id="svg">[SVG]</dt>
	<dd>
Jon Ferraiolo,  藤沢 淳 (FUJISAWA Jun),  Dean Jackson, Ed.,
<a href="http://www.w3.org/TR/SVG/"><cite>Scalable Vector Graphics (SVG) 1.1
Specification</cite></a>,
W3C Recommendation. (See 
<a href="http://www.w3.org/TR/SVG/">http://www.w3.org/TR/SVG</a>.)
</dd>
	<dt id="UTR10">[UTR #10]</dt>
	<dd>
Mark Davis, Ken Whistler,
<a href="http://www.unicode.org/unicode/reports/tr10/"><cite>Unicode Collation
Algorithm</cite></a>,
Unicode Technical Report #10. (See 
<a href="http://www.unicode.org/unicode/reports/tr10/">http://www.unicode.org/unicode/reports/tr10</a>.)
</dd>
	<dt id="UTR17">[UTR #17]</dt>
	<dd>
Ken Whistler, Mark Davis,
<a href="http://www.unicode.org/unicode/reports/tr17/"><cite>Character
Encoding Model</cite></a>,
Unicode Technical Report #17. (See 
<a href="http://www.unicode.org/unicode/reports/tr17/">http://www.unicode.org/unicode/reports/tr17</a>.)
</dd>
	<dt id="UTR29">[UTR #29]</dt>
	<dd>
Mark Davis,
<a href="http://www.unicode.org/unicode/reports/tr29/"><cite>Text Boundaries</cite></a>,
Unicode Standard Annex #29. (See 
<a href="http://www.unicode.org/unicode/reports/tr29/">http://www.unicode.org/unicode/reports/tr29</a>
for the latest version).
</dd>
	<dt id="UXML">[UXML]</dt>
	<dd>
Martin Dürst and Asmus Freytag,
<a href="http://www.w3.org/TR/unicode-xml/"><cite>Unicode in XML and other
Markup Languages</cite></a>,
Unicode Technical Report #20 and W3C Note. (See
<a href="http://www.w3.org/TR/unicode-xml/">http://www.w3.org/TR/unicode-xml</a>.)
</dd>
	<dt id="xml10">[XML 1.0]</dt>
	<dd>
Tim Bray, Jean Paoli, C. M.
Sperberg-McQueen, Eve Maler, François Yergeau, Eds.,
<a href="http://www.w3.org/TR/REC-xml/"><cite>Extensible Markup Language (XML)
1.0</cite></a>,
W3C Recommendation. (See
<a href="http://www.w3.org/TR/REC-xml/">http://www.w3.org/TR/REC-xml</a>.)
</dd>
	<dt id="XML_Japanese_profile">[XML Japanese Profile]</dt>
	<dd>
MURATA
Makoto Ed.,
<a href="http://www.w3.org/TR/japanese-xml/"><cite>XML Japanese
Profile</cite></a>,
W3C Note. (See 
<a href="http://www.w3.org/TR/japanese-xml/">http://www.w3.org/TR/japanese-xml</a>.)
</dd>
	<dt id="xpath">[XPath]</dt>
	<dd>
James Clark, Steve DeRose, Eds,
<a href="http://www.w3.org/TR/xpath"><cite>XML Path Language (XPath) Version
1.0</cite></a>,
W3C Recommendation. (See
<a href="http://www.w3.org/TR/xpath">http://www.w3.org/TR/xpath</a>.)
</dd>
</dl>

	</section>
</section>
<section id="sec-CharExamples">

<h2 title="Examples of Characters, Keystrokes and Glyphs (Non-Normative)">B. 文字, キーストローク, グリフの例（参考）</h2>

<p id="exampleA6">
コンピュータにおけるこのようなテキストの複雑さ（そのほとんどが人々の表記体系の複雑さを反映するもの）の全体像の把握に役立つであろう，少数の例を挙げる。
<span lang="en">
A few examples will help make sense all this complexity of text in computers (which is mostly a reflection of the complexity of human writing systems). Let us start with a very simple example: a user, equipped with a US-English keyboard, types "Foo", which the computer encodes as 16-bit values (the UTF-16 encoding of Unicode) and displays on the screen.
</span></p>

<p>
最初はごく単純な例から始める：
US-English キーボードを使う利用者が
“<kbd><kbd>F</kbd><kbd>o</kbd><kbd>o</kbd></kbd>”
とタイプするとする。
コンピュータでは 16-bit 値として符号化され（ Unicode の UTF-16 符号化法）, スクリーンに表示される。
<span lang="en">↑</span></p>

<table>
<caption >
Basic Latin の例
<span class="summary">（ U.S. キーボードで
<kbd><kbd>F</kbd><kbd>o</kbd><kbd>o</kbd></kbd>
がタイプされたときのキーストローク, 入力文字, 符号化された文字, その表示の一覧）</span>
<span lang="en">
Example: Basic Latin
(Table showing keystrokes, input characters, encoded characters and display for user typing Foo on a U.S. keyboard)
</span></caption>

<tbody>
<tr class="kbd"><th>キーストローク<td>Shift-f<td>o<td>o
<tr><th>入力文字<td>F<td>o<td>o
<tr><th>符号化文字（16 進バイト値）<td>0046<td>006F<td>006F
<tr><th>テキスト<td colspan="3"><samp><kbd>Foo</kbd></samp>
</tbody></table>


<p>
唯一，大文字の
<samp><kbd>F</kbd></samp>
を入力するために修飾キー（ Shift ）を利用する所だけ、単純でない。
<span lang="en">
The only complexity here is the use of a modifier (Shift) to input the capital 'F'.
</span></p>

<p>
もう少し複雑な例を示す。
利用者が伝統的カナダ・フランス語キーボードで
<samp><kbd>çé</kbd></samp>
（ <kbd ><kbd>¸</kbd><kbd>c</kbd><kbd>é</kbd></kbd> ）をタイプしたとする（ここでも，コンピュータ上では UTF-16 に符号化されて表示されるとする）。
ここでは、このコンピュータが UTF-16 の合成済形式（ fully composed form ）を利用しているとする。
<span lang="en">
A slightly more complex example is a user typing 'çé' on a traditional French-Canadian keyboard, which the computer again encodes in UTF-16 and displays. We assume that this particular computer uses a fully composed form of UTF-16.
</span></p>

<table>
<caption>
ダイアクリティカルマークを伴う Latin の例
<span class="summary">（カナダ・フランス語キーボードで
<samp><kbd>çé</kbd></samp>
がタイプされたときのキーストローク, 入力文字, 符号化された文字, その表示の一覧）</span>
<span lang="en">
Example: Latin with diacritics
(Table showing keystrokes, input characters, encoded characters and display for user typing çé on a French-Canadian keyboard)
</span></caption>
<tbody>
<tr class="kbd"><th>キーストローク<td>¸<td>c<td>é
<tr><th>入力文字<td colspan="2">ç<td>é
<tr><th>符号化文字（16 進バイト値）<td colspan="2">00E7<td>00E9
<tr><th>テキスト<td colspan="3"><samp><kbd>çé</kbd></samp>
</tbody></table>


<p>
注目される点が少しばかりある：
まず，利用者がセディーユ（ <kbd><kbd>¸</kbd></kbd> ）をタイプしても、キーボードドライバの状態が変化することを除いて，何も起きない
—
セディーユは
<dfn id="dead-key">デッドキー</dfn>
<!-- 
https://ja.wikipedia.org/wiki/%E3%83%87%E3%83%83%E3%83%89%E3%82%AD%E3%83%BC
-->
である。
続いて，ドライバにてキーストローク <kbd><kbd>c</kbd></kbd> が検知されると、１個の 16-bit
<a href="#def-CEF">符号単位</a>
として表現される完全な文字
<span class="qchar">ç</span>
がシステムに供され，<a href="#def-glyph">グリフ</a>
<span class="qchar">ç</span>
が表示される。
次に，利用者が専用の
<kbd><kbd>é</kbd></kbd>
キーを押し下げた場合も、２個のバイトで表現される１個の文字になる。
ほとんどのシステムでは，これを１個のグリフで表示するが、２個のグリフの組（基底 字とアクセントマーク）から同じ描画を得る手法もあり得る。
<span lang="en">
A few interesting things are happening here: when the user types the cedilla ('¸'), nothing happens except for a change of state of the keyboard driver; the cedilla is a dead key. When the driver gets the c keystroke, it provides a complete 'ç' character to the system, which represents it as a single 16-bit code unit and displays a 'ç' glyph. The user then presses the dedicated 'é' key, which results in, again, a character represented by two bytes. Most systems will display this as one glyph, but it is also possible to combine two glyphs (the base letter and the accent) to obtain the same rendering.
</span></p>

<p>
日本語の例：
利用者が
<dfn id="romaji-input-method">ローマ字入力メソッド</dfn>
を利用して “<samp><kbd>日本語</kbd></samp>” （ U+65E5, U+672C, U+8A9E ）とタイプし，コンピュータ上では、 UTF-16 に符号化されて表示されるとする。
<span lang="en">
On to a Japanese example: our user employs a romaji input method to type '日本語' (U+65E5, U+672C, U+8A9E), which the computer encodes in UTF-16 and displays.
</span></p>

<table>
<caption>
日本語の例
<span class="summary">（日本語 ローマ字入力メソッドで
<kbd><kbd>n</kbd><kbd>i</kbd><kbd>h</kbd><kbd>o</kbd><kbd>n</kbd><kbd>g</kbd><kbd>o</kbd></kbd>
がタイプされたときのキーストローク, 入力文字, 符号化された文字, その表示の一覧）</span>
<span lang="en">
Example: Japanese
(Table showing keystrokes, input characters, encoded characters and display for user typing nihongo in a Japanese Romaji input method)
</span></caption>
<tbody>
<tr class="kbd"><th>キーストローク<td colspan="3">n i h o n g o &lt;space&gt; &lt;return&gt;
<tr><th>入力文字<td>日<td>本<td>語
<tr><th>符号化文字（16 進バイト値）<td>65E5<td>672C<td>8A9E
<tr><th>表示（画像）<td colspan="3">
<a href="#nihongo"><img
	src="charmod/nihongo.gif"
	height="16" width="47"
	alt=
"“nihongo” と発音される３個の漢字, U+65E5, U+672C, U+8A9E 。
Three Kanji characters, U+65E5, U+672C, U+8A9E, pronounced 'nihongo'."
></a>

<tr><th class="trans-note">【表示（テキスト）】<td colspan="3">
<samp><kbd>日本語</kbd></samp>

</tbody></table>

<p>
ここで注目される点は入力である：
利用者がタイプした Latin 文字は、まず（ここには示されないが）その場で仮名に変換される。
利用者は望むだけ <kbd><kbd>space</kbd></kbd> キーを押し下げて，それを変換する。
最終的に 利用者が <kbd><kbd>return</kbd></kbd> キーを押し下げたとき，その漢字の文字がアプリケーションに送信される。
この３文字が生成されるまでに，利用者は計９回のキーストロークを要する。その後の符号化と表示は、ごく自明に行われる。
<span lang="en">
The interesting aspect here is input: the user types Latin characters, which are converted on the fly to kana (not shown here), and then to kanji when the user requests conversion by pressing &lt;space&gt;; the kanji characters are finally sent to the application when the user presses &lt;return&gt;. The user has to type a total of nine keystrokes before the three characters are produced, which are then encoded and displayed rather trivially.
</span></p>

<p id="_arabic-example_">
アラビア語 用字系の下でのペルシャ語では，また異なる様相を呈する：
<span lang="en">
A Persian example, using Arabic script, will show different phenomena:
</span></p>


<table id="arabicLigature-0">
<caption>
ペルシャ語の例
<span class="summary">（アラビア語キーボードでタイプされたときのキーストローク, 入力文字, 符号化された文字, その表示の一覧）</span>
<span lang="en">
Example: Persian
(Table showing keystrokes, input characters, encoded characters and display for user typing on an Arabic keyboard)
</span></caption>
<tbody>
<tr><th>キーストローク（画像）
<td>
<img src="charmod/fa-lam.gif" alt="ARABIC LETTER LAM" height="23" width="16">
<td>
<img src="charmod/fa-alif.gif" alt="ARABIC LETTER ALEF" height="23" width="7">
<td colspan="2">
<img src="charmod/fa-lamalif.gif" alt="Arabic ligature 'lam-alef'." height="23" width="16">
<td>
<img src="charmod/fa-yeh.gif" alt="ARABIC LETTER FARSI YEH" height="23" width="20">
<td>
<img src="charmod/fa-yeh.gif" alt="ARABIC LETTER FARSI YEH" height="23" width="20">


<tr class="kbd"><th class="trans-note">【キーストローク（テキスト）】
<td>&#x0644;<td>&#x0627;<td colspan="2">&#xFEFB;<td>&#x06CC;<td>&#x06CC;

<!-- 
<tr><th>入力文字<td>ل<td>ا<td>ل<td>ا<td>ی<td>ی
 -->

<tr><th>入力文字
<td>&#x0644;<td>&#x0627;<td>&#x0644;<td>&#x0627;<td>&#x06CC;<td>&#x06CC;
<tr><th>符号化文字（16 進バイト値）
<td>0644<td>0627<td>0644<td>0627<td>06CC<td>06CC
<tr><th>表示（画像）
<td colspan="6">
<a href="#arabicLigature"><img
	src="charmod/fa-laalaayee.gif"
	height="23" width="53"
	alt=
"表示出力は右から左の順に現れる：
２個の lam-alef 合字に続き，farsi yeh グリフと尾字形の farsi yeh グリフ。
The displayed output appears, from right to left, as: two lam-alef ligatures, and initial farsi yeh glyph attached to a final farsi yeh glyph."></a>

<tr><th class="trans-note">【表示（テキスト）】
<td colspan="6">
<samp><kbd>&#x0644;&#x0627;&#x0644;&#x0627;&#x06CC;&#x06CC;</kbd></samp>
<!-- 

لالایی

 -->

</tbody></table>

<p>
ここでは最初の２回のキーストロークそれぞれが１個の入力文字を経て１個の符号化された文字を生成するが、そのペアは１個のグリフとして表示される（
'<img src="charmod/fa-lamalif.gif" alt="Arabic ligature 'lam-alef'." height="23" width="16">', 合字 lam-alef）。
次のキーストロークは，一部のアラビア語用字系キーボードに備わる <kbd ><kbd >lam-alef</kbd></kbd> であり，１回のキーストロークで同じ２個の文字を生成し，前と同じように表示される。
この２番目の lam-alef は、表示の際には，最初のものの<em>左</em>に置かれる。
最後の２回のキーストロークは２個の同一の文字を生成するが，２個の異なるグリフで描画される（中字形（ medial form ）に後続して，その左に尾字形（ final form ））。
したがって、５回のキーストロークから６文字が生成され，４個のグリフが右から左へ配置される。
<span lang="en">
Here the first two keystrokes each produce an input character and an encoded character, but the pair is displayed as a single glyph ('', a lam-alef ligature). The next keystroke is a lam-alef, which some Arabic script keyboards have; it produces the same two characters which are displayed similarly, but this second lam-alef is placed to the left of the first one when displayed. The last two keystrokes produce two identical characters which are rendered by two different glyphs (a medial form followed to its left by a final form). We thus have 5 keystrokes producing 6 characters and 4 glyphs laid out right-to-left.
</span></p>

<p id="sec-CharExamplesA5">
最後にタミル語の例。
ISCII キーボードでタイプされ，新たな様相が見られる：
<span lang="en">
A final example in Tamil, typed with an ISCII keyboard, will illustrate some additional phenomena:
</span></p>

<table>
<caption>
タミル語の例
<span class="summary">（ Tamil ISCII キーボードでタイプされたときのキーストローク, 入力文字, 符号化された文字, その表示の一覧）</span>
<span lang="en">
Example: Tamil
(Table showing keystrokes, input characters, encoded characters and display for user typing on a Tamil ISCII keyboard)
</span></caption>
<tbody>
<tr><th>キーストローク（画像）<td>
<img src="charmod/ta-tm.gif" alt="TAMIL LETTER TTA" height="14" width="22">
<td>
<img src="charmod/a-tm.gif" alt="TAMIL VOWEL SIGN AA" height="15" width="26">
<td>
<img src="charmod/nga-tm.gif" alt="TAMIL LETTER NGA" height="14" width="21">
<td>
<img src="charmod/virama-tm.gif" alt="TAMIL SIGN VIRAMA" height="16" width="17">
<td>
<img src="charmod/ka-tm.gif" alt="TAMIL LETTER KA" height="14" width="17">
<td>
<img src="charmod/o-tm.gif" alt="TAMIL VOWEL SIGN OO" height="17" width="40">

<tr><th>入力文字<td>&#x0B9F;<td>&#x0BBE;<td>&#x0B99;<td> &#x0BCD;<td>&#x0B95;<td>&#x0BCB;
<!-- ட／ா／ங／ ்／க／ோ -->
<tr><th>符号化文字（16 進バイト値）<td>0B9F<td>0BBE<td>0B99<td>0BCD<td>0B95<td>0BCB
<tr><th>表示（画像）<td colspan="6">
<a href="#tamil"><img
	src="charmod/tango.gif"
	height="17" width="82"
	alt="タミル語の字による 'Tango'"
></a>

<tr><th class="trans-note">【表示（テキスト）】<td colspan="6">
<samp><kbd>&#x0B9F;&#x0BBE;&#x0B99;&#x0BCD;&#x0B95;&#x0BCB;</kbd></samp>
<!-- டாங்கோ -->

</tbody></table>

<p>
ここでは入力自体は素直であるが、前掲のアクセントマーク付きの Latin の例とは逆に，ヴィラーマ ダイアクリティカルマーク
<span class="qchar">&#x0BCD;</span> (U+0BCD)
が，その適用対象の
<span class="qchar">&#x0B99;</span> (U+0B99)
の<em>後に</em>入力される。
また、最後の２個の文字の描画が特徴的である：
最後のもの
<span class="qchar">&#x0BCB;</span> (U+0BCB)
は明らかに２個のグリフからなり，最後の手前の文字
<span class="qchar">&#x0B95;</span> (U+0B95)
のグリフを<em>囲っている</em>。
<span lang="en">
Here input is straightforward, but note that contrary to the preceding accented Latin example, the virama diacritic '&#x0BCD;' (U+0BCD) is entered after the '&#x0B99;' (U+0B99) to which it applies. Rendering is interesting for the last two characters. The last one '&#x0BCB;' (U+0BCB) clearly consists of two glyphs which surround the glyph of the next to last character '&#x0B95;' (U+0B95).
</span></p>

</section>
<section id="sec-ExampleText">

<h2 title="Example text (Non-Normative)">C. テキストの例（参考）</h2>

<p>
次の一覧は、この文書で画像により例示された文字列または文字のテキストバージョンである。
これらは、テキストのカット＆ペーストの便宜のために用意されている。
<span lang="en">
The following are textual versions of strings or characters used in image-based examples in this document. They are provided here for the benefit of those who want to cut and paste the text for their own testing.
</span></p>

<table>
	<col span="1" />
	<col span="1" />
	<col span="1" style="min-width:13em;"/>
	<thead>
		<tr>
<th>節
<th title="Example:">例示画像
<th title="Text:">テキスト
	</thead>
	<tbody>
		<tr id="arabic-select">
<td title="3.3 Units of visual rendering"><a href="#sec-VisualRenderingUnits">3.3 節</a>

<td><img
	src="charmod/logSelMemory.gif"
	height="27" width="323"
	alt=
"２個のアラビア語の単語に年号が後続する文字列における論理順による文字の並び。
２番目の単語の途中から年号の途中までの範囲に入る文字が選択されたとするとき、論理選択モードでは，強調表示される範囲が１個の連続的な文字並びになる。
An example showing the logical order of characters in a string containing two Arabic words followed by a year number. In logical selection mode, the range of characters
selected by starting the selection in the middle of the second word and ending in the middle of the year number is depicted using highlighting. The highlighting covers a single block of contiguous characters."
>
<td><samp >&#x0639;&#x062F;&#x062F; &#x0645;&#x0627;&#x0631;&#x0633; &#x0661;&#x0669;&#x0669;&#x0668;</samp>
<!-- 

عدد مارس ١٩٩٨

-->
		

		<tr id="stumpOfTree">
<td><a href="#sec-Strings" title="Section: 6.1 String concepts">6.1 節</a>

<td><img
	src="charmod/chineseSurrogate.gif"
	height="25" width="24"
	alt=
"Ideographic supplementary character: Archaic Chinese character meaning &quot;the stump of a tree&quot; (still in current use in Cantonese)"
><img
	src="charmod/not_equal.gif" alt="NOT EQUAL TO" height="26" width="25"
><img
	src="charmod/Q.gif"
	alt="LATIN SMALL LETTER Q"
	height="21" width="14"
><img
	src="charmod/caron.gif"
	alt="COMBINING CARON"
	height="21" width="14"
>

<td><samp>&#x233B4;&#x2260;&#x0071;&#x030C;</samp>
<!-- 𣎴≠q̌ -->
		

		<tr id="nihongo">
<td><a href="#sec-CharExamples" title="Section: B Examples of Characters, Keystrokes and Glyphs">付録 B</a>

<td><img src="charmod/nihongo.gif" alt="Three Kanji characters, U+65E5, U+672C, U+8A9E, pronounced
'nihongo'." height="16" width="47">

<td><samp>日本語</samp>
		
		<tr id="arabicLigature">
<td><a href="#sec-CharExamples" title="Section: B Examples of Characters, Keystrokes and Glyphs">付録 B</a>

<td><img
	src="charmod/arabe.gif"
	height="26" width="42"
	alt=
"表示出力は右から左の順に現れる：
２個の lam-alef 合字に続き，頭字形の ghayn グリフと尾字形の ghayn グリフ。
The displayed output appears, from right to left, as: two lam-alef ligatures, and initial ghayn glyph attached to a final ghayn glyph."
>

<td><samp >&#x0644;&#x0627;&#x0644;&#x0627;&#x063A;&#x063A;</samp>
<!-- 

لالاغغ

-->
		
		<tr>
<td colspan="3" >
<p style="width:600px;text-align:left;" class="trans-note">【
直上の画像／テキストはおそらく原文の誤り：
一部の文字の符号位置／示されている画像が
<a href="#arabicLigature-0" >本文のもの</a>（テキスト：
<samp >&#x0644;&#x0627;&#x0644;&#x0627;&#x06CC;&#x06CC;</samp>
）と食い違っている。
】</p>
<!-- 

لالایی

 -->
		

		<tr id="tamil">
<td><a href="#sec-CharExamples" title="Section: B Examples of Characters, Keystrokes and Glyphs">付録 B</a>

<td><img
	src="charmod/tango.gif"
	height="17" width="82"
	alt="タミル語の字による 'Tango'"
>

<td><samp >&#x0B9F;&#x0BBE;&#x0B99;&#x0BCD;&#x0B95;&#x0BCB;</samp>
<!-- டாங்கோ -->

		
	</tbody>
</table>

</section>
<section id="sec-Checklist">

<h2 title="List of conformance criteria (Non-Normative)">D. 適合性基準の一覧（参考）</h2>

<p>
以下に、文書順に並べられたこの仕様の適合性基準を挙げる。
この一覧は、仕様／実装／内容が この仕様に適合するかどうかを検査するために利用できる。
<span lang="en">
This is a list of the conformance criteria in this specification, in document order. This list can be used to check specifications, implementations, and content for conformance to this specification.
</span></p>

<p class="trans-note">【
このページの初期状態では、一覧は未表示：
次のボタンで表示の有無を
<input type="button" value="切り替える" onclick="expand_checklist()"/>
。
】</p>


<p>
その際には，次の事項が念頭に置かれるべきである：
<span lang="en">
When doing so, the following points should be kept in mind:
</span></p>

<ul>
	<li>
最初に文書全体をよく読んで意味を把握しておくこと。
この一覧は、テキストの本文の文脈の下で これらの適合性基準を読んだ上で，初めて、早見表の用をなす。
<span lang="en">
To ensure that you understand the meaning, read the whole document first. Use this list as a quick reference only after having first read the conformance criteria in context in the main body of the text.
</span></li>
	<li>
この一覧の適合性基準の意味が、それを囲んでいる，この文書の本文テキストを読んだ上でも明らかでないのであれば、
www-i18n-comments@w3.org （
<a href="https://lists.w3.org/Archives/Public/www-i18n-comments/">公開アーカイブ</a>
）宛までコメントを寄せることも考慮されたし。
<span lang="en">
If the meaning of a conformance criterion in this list is still unclear after referring back to the surrounding text in the main body of the document, consider sending a comment to www-i18n-comments@w3.org (publicly archived).
</span></li>
	<li>
すべての適合性基準が，すべての仕様／実装／内容に適用されるわけではない。
実際の適合性を検査する前に，適用し得るものかどうか検査されるべきである。
例えば C010 は，仕様に対してのみ適用される。
別の例として， C002 は 仕様／実装／内容 いずれにも適用されるが、それは 文字と表示されるテキストの単位との対応関係が扱われる場合に限られる。
<span lang="en">
Not all conformance criteria apply to all specifications, implementations, or content. Before checking for actual conformance, applicability should be checked. As an example, C010 only applies to specifications. As another example, C002 applies to specifications, implementations, and content, but only if it deals with mapping between characters and units of displayed text.
</span></li>
</ul>

<div id="_checklist_" ></div>

<!-- 
table id="req-checklist"
 -->

</section>
<section id="sec-Changes">

<h2 title="Changes since the Proposed Recommendation (Non-Normative)">E. 勧告案からの変更点（参考）</h2>

<ul>
	<li>
参照文献<!--1-->節の少数のリンクと参照が更新された。
<span lang="en">
A small number of links and references were updated in the references section.
</span></li>
	<li>
C076 の後の段落における明確化のための小さな編集上の修正：
“したがって、例えば，実際には iso-8859-1 に符号化されるものとは異なる
用字系／文字／記号
を表現する目的で
<del >iso-8859-1 で符号化されるレパートリ</del>
<ins >ISO Latin 1 文字集合の符号位置</ins>
を誤利用するようなフォントの構築などは禁止される。”
<span lang="en">
Minor editorial change to paragraph after C076 to clarify: "This prohibits, for example, the construction of fonts that misuse the repertoire encoded by iso-8859-1 to represent different scripts, characters, or symbols than what is actually encoded in iso-8859-1." changed to "This prohibits, for example, the construction of fonts that misuse the codepoints in the ISO Latin 1 character set to represent different scripts, characters, or symbols than those actually encoded in iso-8859-1.".
</span></li>

</ul>

</section>
<section id="sec-Acknowledgements">

<h2 title="Acknowledgements (Non-Normative)">F. 謝辞（参考）</h2>

<p>
（敬称略）
URI の節に対する重要な詳細を供された Tim Berners-Lee と James Clark に。
制作と編集の処理に多大な補助を与えてくれた Asmus Freytag, Addison Phillips, 早期の段階での Ian Jacobs に。
有益なコメントと示唆を供された，多くの方々と W3C の I18N WG と IG に。
<span lang="en">
Tim Berners-Lee and James Clark provided important details in the section on URIs. Asmus Freytag , Addison Phillips, and in early stages Ian Jacobs, provided significant help in the authoring and editing process. The W3C I18N WG and IG, as well as many others, provided many helpful comments and suggestions.
</span></p>

</section>

</main><!--  id="MAIN" -->


<!--% DATA SECTION -->
<div style="display:none;">

<span id="_optional_controls">
<input type="button" tabindex="1" accesskey="X" value="用語" 
	onclick="switch_words()" title="アクセスキー： X"/>：
<label><input type="radio" id="words0" name="words" onclick="switch_words(0)" checked="checked" />初期状態</label>
<label><input type="radio" id="words1" name="words" onclick="switch_words(1)" />英語寄り</label>
<label><input type="radio" id="words2" name="words" onclick="switch_words(2)" />英語主体</label>
<!-- 
<label><input type="radio" id="words3" name="words" onclick="switch_words(3)" />英語</label>
 -->

</span>

</div><!-- DATA SECTION -->


</body></html>

<!--
table summary → span.summary
border="1" cellpadding="5" cellspacing="0" → CSS
<span class="example-head">EXAMPLE:&nbsp;</span> → <b>例：</b>
<span class="note-head">NOTE: </span> → <b>注記：</b>
<span class="new-term">([^<]+)</span> → <dfn>$1</dfn>

-->