reference/mbstring/functions/mb-detect-encoding.xml
2c17cef6e71c3d85011319cde128cc4edf89a053
...
...
@@ -9,13 +9,25 @@
9
9
<refsect1 role="description">
10
10
&reftitle.description;
11
11
<methodsynopsis>
12
-
<type>string</type><methodname>mb_detect_encoding</methodname>
13
-
<methodparam><type>string</type><parameter>str</parameter></methodparam>
14
-
<methodparam choice="opt"><type>mixed</type><parameter>encoding_list</parameter><initializer>mb_detect_order()</initializer></methodparam>
15
-
<methodparam choice="opt"><type>bool</type><parameter>strict</parameter><initializer>false</initializer></methodparam>
12
+
<type class="union"><type>string</type><type>false</type></type><methodname>mb_detect_encoding</methodname>
13
+
<methodparam><type>string</type><parameter>string</parameter></methodparam>
14
+
<methodparam choice="opt"><type class="union"><type>array</type><type>string</type><type>null</type></type><parameter>encodings</parameter><initializer>&null;</initializer></methodparam>
15
+
<methodparam choice="opt"><type>bool</type><parameter>strict</parameter><initializer>&false;</initializer></methodparam>
16
16
</methodsynopsis>
17
17
<para>
18
-
Detects character encoding in <type>string</type> <parameter>str</parameter>.
18
+
Detects the most likely character encoding for <type>string</type> <parameter>string</parameter>
19
+
from an ordered list of candidates.
20
+
</para>
21
+
<para>
22
+
Automatic detection of the intended character encoding can never be entirely reliable;
23
+
without some additional information, it is similar to decoding an encrypted string
24
+
without the key. It is always preferable to use an indication of character encoding
25
+
stored or transmitted with the data, such as a "Content-Type" HTTP header.
26
+
</para>
27
+
<para>
28
+
This function is most useful with multibyte encodings, where not all sequences of
29
+
bytes form a valid string. If the input string contains such a sequence, that
30
+
encoding will be rejected, and the next encoding checked.
19
31
</para>
20
32
</refsect1>
21
33

...
...
@@ -24,24 +36,25 @@
24
36
<para>
25
37
<variablelist>
26
38
<varlistentry>
27
-
<term><parameter>str</parameter></term>
39
+
<term><parameter>string</parameter></term>
28
40
<listitem>
29
41
<para>
30
-
The <type>string</type> being detected.
42
+
The <type>string</type> being inspected.
31
43
</para>
32
44
</listitem>
33
45
</varlistentry>
34
46
<varlistentry>
35
-
<term><parameter>encoding_list</parameter></term>
47
+
<term><parameter>encodings</parameter></term>
36
48
<listitem>
37
49
<para>
38
-
<parameter>encoding_list</parameter> is list of character
39
-
encoding. Encoding order may be specified by array or comma
40
-
separated list string.
50
+
A list of character encodings to try, in order. The list may be specified as
51
+
an array of strings, or as a single comma-separated string.
41
52
</para>
42
53
<para>
43
-
If <parameter>encoding_list</parameter> is omitted,
44
-
detect_order is used.
54
+
If <parameter>encodings</parameter> is omitted or &null;,
55
+
the current detect_order (set with the <link linkend="ini.mbstring.detect-order">
56
+
mbstring.detect_order</link> configuration option, or the <function>mb_detect_order</function>
57
+
function) will be used.
45
58
</para>
46
59
</listitem>
47
60
</varlistentry>
...
...
@@ -49,9 +62,16 @@
49
62
<term><parameter>strict</parameter></term>
50
63
<listitem>
51
64
<para>
52
-
<parameter>strict</parameter> specifies whether to use
53
-
the strict encoding detection or not.
54
-
Default is &false;.
65
+
Controls the behaviour when <parameter>string</parameter>
66
+
is not valid in any of the listed <parameter>encodings</parameter>.
67
+
If <parameter>strict</parameter> is set to &false;, the closest matching
68
+
encoding will be returned; if <parameter>strict</parameter> is set to &true;,
69
+
&false; will be returned.
70
+
</para>
71
+
<para>
72
+
The default value for <parameter>strict</parameter> can be set
73
+
with the <link linkend="ini.mbstring.strict-detection">
74
+
mbstring.strict_detection</link> configuration option.
55
75
</para>
56
76
</listitem>
57
77
</varlistentry>
...
...
@@ -62,11 +82,37 @@
62
82
<refsect1 role="returnvalues">
63
83
&reftitle.returnvalues;
64
84
<para>
65
-
The detected character encoding or &false; if the encoding cannot be
66
-
detected from the given string.
85
+
The detected character encoding, or &false; if the string is not valid
86
+
in any of the listed encodings.
67
87
</para>
68
88
</refsect1>
69
89

90
+
<refsect1 role="changelog">
91
+
&reftitle.changelog;
92
+
<informaltable>
93
+
<tgroup cols="2">
94
+
<thead>
95
+
<row>
96
+
<entry>&Version;</entry>
97
+
<entry>&Description;</entry>
98
+
</row>
99
+
</thead>
100
+
<tbody>
101
+
<row>
102
+
<entry>8.2.0</entry>
103
+
<entry>
104
+
<function>mb_detect_encoding</function> will no longer return
105
+
the following non-text encodings:
106
+
<literal>"Base64"</literal>, <literal>"QPrint"</literal>,
107
+
<literal>"UUencode"</literal>, <literal>"HTML entities"</literal>,
108
+
<literal>"7 bit"</literal> and <literal>"8 bit"</literal>.
109
+
</entry>
110
+
</row>
111
+
</tbody>
112
+
</tgroup>
113
+
</informaltable>
114
+
</refsect1>
115
+

70
116
<refsect1 role="examples">
71
117
&reftitle.examples;
72
118
<para>
...
...
@@ -75,23 +121,100 @@
75
121
<programlisting role="php">
76
122
<![CDATA[
77
123
<?php
78
-
/* Detect character encoding with current detect_order */
124
+
// Detect character encoding with current detect_order
79
125
echo mb_detect_encoding($str);
80
126

81
-
/* "auto" is expanded according to mbstring.language */
127
+
// "auto" is expanded according to mbstring.language
82
128
echo mb_detect_encoding($str, "auto");
83
129

84
-
/* Specify encoding_list character encoding by comma separated list */
130
+
// Specify the "encodings" parameter as a comma-separated list
85
131
echo mb_detect_encoding($str, "JIS, eucjp-win, sjis-win");
86
132

87
-
/* Use array to specify encoding_list */
88
-
$ary[] = "ASCII";
89
-
$ary[] = "JIS";
90
-
$ary[] = "EUC-JP";
91
-
echo mb_detect_encoding($str, $ary);
133
+
// Use an array to specify the "encodings" parameter
134
+
$encodings = [
135
+
"ASCII",
136
+
"JIS",
137
+
"EUC-JP"
138
+
];
139
+
echo mb_detect_encoding($str, $encodings);
140
+
?>
141
+
]]>
142
+
</programlisting>
143
+
</example>
144
+
</para>
145
+
<para>
146
+
<example>
147
+
<title>Effect of <parameter>strict</parameter> parameter</title>
148
+
<programlisting role="php">
149
+
<![CDATA[
150
+
<?php
151
+
// 'áéóú' encoded in ISO-8859-1
152
+
$str = "\xE1\xE9\xF3\xFA";
153
+

154
+
// The string is not valid ASCII or UTF-8, but UTF-8 is considered a closer match
155
+
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8'], false));
156
+
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8'], true));
157
+

158
+
// If a valid encoding is found, the strict parameter does not change the result
159
+
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8', 'ISO-8859-1'], false));
160
+
var_dump(mb_detect_encoding($str, ['ASCII', 'UTF-8', 'ISO-8859-1'], true));
161
+
?>
162
+
]]>
163
+
</programlisting>
164
+
&example.outputs;
165
+
<screen>
166
+
<![CDATA[
167
+
string(5) "UTF-8"
168
+
bool(false)
169
+
string(10) "ISO-8859-1"
170
+
string(10) "ISO-8859-1"
171
+
]]>
172
+
</screen>
173
+
</example>
174
+
</para>
175
+
<para>
176
+
In some cases, the same sequence of bytes may form a valid string in multiple
177
+
character encodings, and it is impossible to know which interpretation was
178
+
intended. For instance, among many others, the byte sequence "\xC4\xA2" could be:
179
+
</para>
180
+
<para>
181
+
<simplelist>
182
+
<member>
183
+
"Ä¢" (U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS followed by U+00A2 CENT SIGN)
184
+
encoded in any of ISO-8859-1, ISO-8859-15, or Windows-1252
185
+
</member>
186
+
<member>
187
+
"ФЂ" (U+0424 CYRILLIC CAPITAL LETTER EF followed by U+0402 CYRILLIC CAPITAL LETTER
188
+
DJE) encoded in ISO-8859-5
189
+
</member>
190
+
<member>
191
+
"Ģ" (U+0122 LATIN CAPITAL LETTER G WITH CEDILLA) encoded in UTF-8
192
+
</member>
193
+
</simplelist>
194
+
</para>
195
+
<para>
196
+
<example>
197
+
<title>Effect of order when multiple encodings match</title>
198
+
<programlisting role="php">
199
+
<![CDATA[
200
+
<?php
201
+
$str = "\xC4\xA2";
202
+

203
+
// The string is valid in all three encodings, so the first one listed will be returned
204
+
var_dump(mb_detect_encoding($str, ['UTF-8', 'ISO-8859-1', 'ISO-8859-5']));
205
+
var_dump(mb_detect_encoding($str, ['ISO-8859-1', 'ISO-8859-5', 'UTF-8']));
206
+
var_dump(mb_detect_encoding($str, ['ISO-8859-5', 'UTF-8', 'ISO-8859-1']));
92
207
?>
93
208
]]>
94
209
</programlisting>
210
+
&example.outputs;
211
+
<screen>
212
+
<![CDATA[
213
+
string(5) "UTF-8"
214
+
string(10) "ISO-8859-1"
215
+
string(10) "ISO-8859-5"
216
+
]]>
217
+
</screen>
95
218
</example>
96
219
</para>
97
220
</refsect1>
...
...
@@ -106,7 +229,6 @@ echo mb_detect_encoding($str, $ary);
106
229
</refsect1>
107
230

108
231
</refentry>
109
-

110
232
<!-- Keep this comment at the end of the file
111
233
Local variables:
112
234
mode: sgml
113
235