Changes: Detect encoding from the charset specified in HTML files

Revision as of 10:17, 20 December 2007

Please review this tip:

This tip was imported from vim.org and needs general review.
You might clean up comments or merge similar tips.
Add suitable categories so people can find the tip.
Please avoid the discussion page (use the Comments section below for notes).
If the tip contains good advice for current Vim, remove the {{review}} line.

Tip 1074 Printable Monobook Previous Next

created December 9, 2005 · complexity advanced · author Wu Yongwei · version 6.0

If one needs to edit files encoded in multiple legacy encodings, then the Vim fileencodings option cannot help much. Some hacks can be used to put the file encoding in the file (see VimTip911). However, in the case of HTML files, the encoding information is often in the HTML file already, especially for non-Latin1 Web pages, for example:

<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

The following code can be put in vimrc to detect and use such an encoding specification:

if has('autocmd')
  " Translate a charset name as written in an HTML file into the encoding
  " name that should be handed to Vim.
  "
  " a:encoding  charset name taken from the HTML file (compared ignoring case)
  " Returns     the mapped encoding name, or a:encoding unchanged when no
  "             mapping is listed below.
  function! ConvertHtmlEncoding(encoding)
    " GB2312 imprecisely means CP936 in HTML
    if a:encoding ==? 'gb2312'
      return 'cp936'
    endif

    " latin1 is the canonical encoding name in Vim
    if a:encoding ==? 'iso-8859-1'
      return 'latin1'
    endif

    if a:encoding ==? 'utf8'
      return 'utf-8'
    endif

    " Other encoding aliases should be added above this line.
    return a:encoding
  endfunction

  " Reload the current HTML buffer with the charset declared in its
  " <meta http-equiv="Content-Type" ...> tag when that charset differs from
  " the encoding the buffer was read with.
  "
  " Nothing happens when:
  "   - the buffer's 'filetype' is not 'html';
  "   - no matching <meta> tag is found;
  "   - the current file encoding is not among the automatically tried
  "     encodings ('fileencodings', or 'encoding' when that option is empty),
  "     which is taken to mean the user forced it, e.g. with ++enc.
  "
  " The user's registers, marks and window view are left untouched.
  function! DetectHtmlEncoding()
    " !=# compares case-sensitively regardless of 'ignorecase'.
    if &filetype !=# 'html'
      return
    endif

    " Look for the first matching tag starting at the very top of the buffer.
    " cursor()/search() are used instead of :normal so that user mappings,
    " the jump list and the yank register are not involved; the view is
    " restored right after the lookup.
    let l:view = winsaveview()
    call cursor(1, 1)
    " 'c' accepts a match at the cursor position, 'W' disables wrap-around.
    let l:found = search('\c<meta http-equiv=\("\?\)Content-Type\1 content="text/html; charset=[-A-Za-z0-9_]\+">', 'cW')
    " Text from the start of the match to the end of that line.
    let l:tail = l:found ? strpart(getline('.'), col('.') - 1) : ''
    call winrestview(l:view)
    if !l:found
      return
    endif

    " \c keeps the extraction as case-insensitive as the search above;
    " otherwise a tag written as CHARSET=... would yield an empty name.
    let l:charset = matchstr(l:tail, '\ctext/html; charset=\zs[-A-Za-z0-9_]\+')
    if l:charset ==# ''
      return
    endif
    let l:charset = ConvertHtmlEncoding(l:charset)

    " An empty 'fileencoding' means the file is handled as 'encoding'.
    let l:current = &fileencoding ==# '' ? &encoding : &fileencoding
    let l:candidates = &fileencodings ==# '' ? &encoding : &fileencodings
    let l:auto_encodings = ',' . tolower(l:candidates) . ','

    " Only override an encoding that Vim chose by itself. stridx() does a
    " literal lookup, so the encoding name is never interpreted as a pattern.
    if l:charset !=? l:current
          \ && stridx(l:auto_encodings, ',' . tolower(l:current) . ',') >= 0
      silent! exec 'e ++enc=' . l:charset
    endif
  endfunction

  " Detect charset encoding in an HTML file
  " The group is cleared first so that sourcing this file again does not
  " register the autocommand a second time. 'nested' lets the autocommands
  " triggered by the reload inside DetectHtmlEncoding() run as well.
  augroup HtmlEncodingDetect
    autocmd!
    autocmd BufReadPost *.htm* nested call DetectHtmlEncoding()
  augroup END
endif

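Note that the autocommand is declared with nested: this lets the autocommands triggered by the reload (the e ++enc=... command inside DetectHtmlEncoding) run as well, which ensures that syntax highlighting is applied to the reloaded buffer and that the remembered cursor position is still restored.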

It is recommended to use set encoding=utf-8 in order to ensure successful encoding conversion.

@@ Line 1: / Line 1: @@
 {{review}}
+{{TipImported
-{{Tip
 |id=1074
+|previous=1073
-|title=Detect encoding from the charset specified in HTML files
+|next=1075
-|created=December 9, 2005 22:41
+|created=December 9, 2005
 |complexity=advanced
 |author=Wu Yongwei
 |version=6.0
 |rating=3/3
+}}
-|text=
-If one needs to edit files encoded in multiple legacy encodings, then the Vim fileencodings option cannot help much. Some hacks can be used to put the file encoding in the file (see Tip &#35;911). However, in the case of HTML files, the encoding information is often in the HTML file already, esp. for non-Latin1 Web pages, i.e.:
+If one needs to edit files encoded in multiple legacy encodings, then the Vim fileencodings option cannot help much. Some hacks can be used to put the file encoding in the file (see [[VimTip911]]). However, in the case of HTML files, the encoding information is often in the HTML file already, especially for non-Latin1 Web pages, for example:
+<pre>
+&lt;meta http-equiv="Content-Type" content="text/html; charset=gb2312"&gt;
+</pre>
+The following code can be put in vimrc to detect and use such an encoding specification:
+<pre>
-&lt;meta http-equiv="Content-Type" content="text/html; charset=gb2312"&gt;
+if has('autocmd')
+  function! ConvertHtmlEncoding(encoding)
+    if a:encoding ==? 'gb2312'
+      return 'cp936' " GB2312 imprecisely means CP936 in HTML
+    elseif a:encoding ==? 'iso-8859-1'
+      return 'latin1' " The canonical encoding name in Vim
+    elseif a:encoding ==? 'utf8'
+      return 'utf-8' " Other encoding aliases should follow here
+    else
+      return a:encoding
+    endif
+  endfunction
+  function! DetectHtmlEncoding()
+    if &amp;filetype != 'html'
+      return
+    endif
+    normal m`
+    normal gg
+    if search('\c&lt;meta http-equiv=\("\?\)Content-Type\1 content="text/html; charset=[-A-Za-z0-9_]\+"&gt;') != 0
+      let reg_bak=@"
+      normal y$
+      let charset=matchstr(@", 'text/html; charset=\zs[-A-Za-z0-9_]\+')
+      let charset=ConvertHtmlEncoding(charset)
+      normal ``
+      let @"=reg_bak
+      if &amp;fileencodings == ''
+        let auto_encodings=',' . &amp;encoding . ','
+      else
+        let auto_encodings=',' . &amp;fileencodings . ','
+      endif
+      if charset !=? &amp;fileencoding &amp;&amp;
+            \auto_encodings =~ ',' . &amp;fileencoding . ','
+        silent! exec 'e ++enc=' . charset
+      endif
+    else
+      normal ``
+    endif
+  endfunction
+  " Detect charset encoding in an HTML file
+  au BufReadPost *.htm* nested call DetectHtmlEncoding()
+endif
+</pre>
+Please notice that the nested autocommand is used to ensure the syntax highlighting is OK and the remembered cursor position is still kept.
-The following code can be put in _vimrc to detect and use such encoding specification:
+It is recommended to use <tt>set encoding=utf-8</tt> in order to ensure successful encoding conversion.
+==Comments==
----- code begins -----
-if has('autocmd')
- function! ConvertHtmlEncoding(encoding)
- if a:encoding ==? 'gb2312'
- return 'cp936' " GB2312 imprecisely means CP936 in HTML
- elseif a:encoding ==? 'iso-8859-1'
- return 'latin1' " The canonical encoding name in Vim
- elseif a:encoding ==? 'utf8'
- return 'utf-8' " Other encoding aliases should follow here
- else
- return a:encoding
- endif
- endfunction
- function! DetectHtmlEncoding()
- if &amp;filetype != 'html'
- return
- endif
- normal m`
- normal gg
- if search('\c&lt;meta http-equiv=\("\?\)Content-Type\1 content="text/html; charset=[-A-Za-z0-9_]\+"&gt;') != 0
- let reg_bak=@"
- normal y$
- let charset=matchstr(@", 'text/html; charset=\zs[-A-Za-z0-9_]\+')
- let charset=ConvertHtmlEncoding(charset)
- normal ``
- let @"=reg_bak
- if &amp;fileencodings == ''
- let auto_encodings=',' . &amp;encoding . ','
- else
- let auto_encodings=',' . &amp;fileencodings . ','
- endif
- if charset !=? &amp;fileencoding &amp;&amp;
- \auto_encodings =~ ',' . &amp;fileencoding . ','
- silent! exec 'e ++enc=' . charset
- endif
- else
- normal ``
- endif
- endfunction
- " Detect charset encoding in an HTML file
- au BufReadPost *.htm* nested call DetectHtmlEncoding()
----- code ends -----
-Please notice that the nested autocommand is used to ensure the syntax highlighting is OK and the remembered cursor position is still kept.
-It is recommended to use `set encoding=utf-8' in order to ensure successful encoding conversion.
-}}
-== Comments ==
-Remember the final 'endif'...
-wolcendo--AT--friko2.onet.pl
-, December 21, 2005 3:32
 ----
-<!-- parsed by vimtips.py in 0.682438  seconds-->