root/HTTP/Unidecode.lua

Revision 1464 (checked in by rsz, 2 weeks ago)

cleanup

Line 
1 --------------------------------------------------------------------------------
2 -- Title:               Unidecode.lua
3 -- Description:         Like a square peg in a round hole
4 -- Author:              Raphaël Szwarc http://alt.textdrive.com/lua/
5 -- Creation Date:       August 30, 2007
6 -- Legal:               Copyright (C) 2007 Raphaël Szwarc
7 --                      Under the terms of the MIT License
8 --                      http://www.opensource.org/licenses/mit-license.html
9 --------------------------------------------------------------------------------
10
11 -- import dependencies
12 local debug = require( 'debug' )
13 local io = require( 'io' )
14 local math = require( 'math' )
15 local string = require( 'string' )
16
17 local assert = assert
18 local getmetatable = getmetatable
19 local pairs = pairs
20 local package = package
21 local rawget = rawget
22 local rawset = rawset
23 local require = require
24 local setmetatable = setmetatable
25 local tostring = tostring
26
27 --------------------------------------------------------------------------------
28 -- Unidecode
29 -- as per Sean M. Burke's Text::Unidecode
30 -- http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
31 -- http://interglacial.com/~sburke/tpj/as_html/tpj22.html
32 --------------------------------------------------------------------------------
33
-- Declare the module with the Lua 5.1 module() function. From here on,
-- unqualified global names resolve against the module table, which is why
-- everything this file needs was captured in locals above.
module( 'Unidecode' )
_VERSION = '1.0'

-- `meta` is the metatable attached to the module table; the __call,
-- __concat and __tostring metamethods defined at the end of this file are
-- stored in it. `self` is the module table itself.
local meta = {}
local self = setmetatable( _M, meta )
39
40 --------------------------------------------------------------------------------
41 -- bits
42 -- as per Stephane Arnold's Validate Unicode String
43 -- http://lua-users.org/wiki/ValidateUnicodeString
44 --------------------------------------------------------------------------------
45
-- Table of powers of two: bits[n] == 2 ^ n.
-- bits.max starts at 64 and is lowered to the last exponent for which
-- 2 ^ n and 2 ^ n - 1 still have distinct tostring() representations.
local bits = { max = 64 }

do
    local exponent = 0
    while exponent <= bits.max do
        local power = 2 ^ exponent
        bits[exponent] = power
        -- integer overflow detection: once subtracting one no longer
        -- changes the printed value, the previous exponent is the limit
        if tostring(power) == tostring(power - 1) then
            bits.max = exponent - 1
            break
        end
        exponent = exponent + 1
    end
end
56
-- Shift `int` left by `shift` bits using plain arithmetic.
-- The value is doubled `shift` times and reduced modulo self[self.max]
-- after every step, so the result wraps around at that bound.
-- `shift` must be a key present in this table, otherwise an assertion fails.
-- Returns the shifted number.
function bits:lshift(int, shift)
    assert(shift ~= nil, "lshift(int,shift)")
    assert(self[shift] ~= nil, "bad shift " .. shift)
    local wrap = self[self.max]
    for _ = 1, shift do
        int = int * 2 % wrap
    end
    return int
end
66
-- Shift `int` right by `shift` bits using plain arithmetic.
-- `int` is divided by self[shift] (the matching power of two) and the
-- quotient is rounded down with math.floor.
-- `shift` must be a key present in this table, otherwise an assertion fails.
-- Returns the shifted number.
function bits:rshift(int, shift)
    -- the message names this function (it used to say "lshift")
    assert(shift ~= nil, "rshift(int,shift)")
    assert(self[shift] ~= nil, "bad shift " .. shift)
    return math.floor(int / self[shift])
end
72
-- Lead-byte prefixes used by bits:utf8.
-- charnum[n] == 2 ^ n - 2 for n = 3 .. 7, i.e. (n - 1) one-bits followed by
-- a zero-bit; bits:utf8 compares it against the top n bits of a lead byte.
local charnum = {}
do
    local width = 3
    while width <= 7 do
        charnum[width] = bits:lshift(1, width) - 2
        width = width + 1
    end
end
77
-- Split `str` into its first byte and the rest.
-- Returns the numeric value of the first byte (nil when `str` is empty)
-- followed by the substring starting at the second byte.
function bits:read(str)
    local first = string.byte(str, 1, 1)
    local rest = string.sub(str, 2)
    return first, rest
end
81
-- Decode the first UTF-8 sequence found at the start of `str`.
-- Returns the decoded code point and the part of `str` that follows the
-- bytes consumed.
-- Raises an assertion error when `str` is empty, when the lead byte matches
-- none of the prefixes in `charnum`, when the sequence ends before all
-- continuation bytes were read, or when a continuation byte does not have
-- the form 10xxxxxx.
function bits:utf8(str)
    local char, num
    char, str = self:read(str)
    -- read() yields nil for an empty string; fail with a clear message
    -- instead of a nil comparison error
    assert(char ~= nil, "invalid char : empty string")
    -- a simple ASCII 7 bit character
    if char < 128 then return char, str end

    -- find which lead-byte prefix matches; `width` is the number of
    -- prefix bits, the sequence has width - 2 continuation bytes
    for width, prefix in pairs(charnum) do
        if prefix == self:rshift(char, 8 - width) then
            num = width
            break
        end
    end
    assert(num ~= nil, "invalid char : " .. char)
    -- takes the first bits of the digits
    local result = char % self[8 - num]
    for i = 1, num - 2 do
        char, str = self:read(str)
        -- the input ended in the middle of a multi-byte sequence
        assert(char ~= nil, "truncated sequence at position " .. i)
        assert(2 == self:rshift(char, 6), "invalid char " .. char ..
            " at position " .. i)
        -- append the six payload bits of the continuation byte
        result = self:lshift(result, 6) + char % self[6]
    end
    return result, str
end
107
108 --------------------------------------------------------------------------------
109 -- Utilities
110 --------------------------------------------------------------------------------
111
-- `data` is the lookup table handed to string.gsub by the module's __call
-- metamethod; `metadata` is its metatable, whose __index is defined below.
local metadata = {}
local data = setmetatable( {}, metadata )
114
-- Build the path of the text file holding block `aBlock`.
-- The directory is taken from this chunk's own source name (the leading
-- character of the source name is skipped), the separator is the
-- punctuation character found before '?.' in package.path ('/' when none
-- is found), and the result has the form
-- <directory><module name><separator><aBlock>.txt
local function Path( aBlock )
    local aSource = debug.getinfo( 1, 'S' ).source
    local aSeparator = package.path:match( '(%p)%?%.' )

    if not aSeparator then
        aSeparator = '/'
    end

    -- position of the last separator, counted from the end of the source name
    local aLength = #aSource
    local aReverseIndex = aSource:reverse():find( aSeparator, 1, true )
    local aDirectoryEnd = aLength - ( aReverseIndex or aLength ) + 1
    local aDirectory = aSource:sub( 2, aDirectoryEnd )

    return string.format( '%s%s%s%s.txt', aDirectory, self._NAME, aSeparator, aBlock )
end
123
-- Lazy loader invoked when `aKey` (one UTF-8 sequence) is missing from the
-- data table.
-- The code point of `aKey` is shifted right by 8 bits and formatted as
-- upper-case hexadecimal to name a block; when the block file returned by
-- Path() can be opened, every entry of Data[ aPath ] is copied into the
-- table with rawset. Values longer than one byte that end with a space have
-- their trailing whitespace/control characters removed and are lower-cased.
-- Returns the stored value for `aKey`, or '' (which is also cached) when
-- the block did not provide one.
function metadata:__index( aKey )
    local Data = require( 'Data' )
    local aBlock = ( '%02x' ):format( bits:rshift( bits:utf8( aKey ), 8 ) ):upper()
    local aPath = Path( aBlock )
    local aFile = io.open( aPath, 'rb' )

    if aFile then
        -- The handle is only an existence probe: release it right away so
        -- it cannot leak if loading or copying the block raises an error.
        aFile:close()

        local aData = Data[ aPath ]

        -- distinct loop names: the parameter aKey is needed again below
        for anEntryKey, anEntryValue in pairs( aData ) do
            if #anEntryValue > 1 and anEntryValue:byte( #anEntryValue ) == 32 then
                anEntryValue = anEntryValue:gsub( '[%s%c]+$', '' ):lower()
            end

            rawset( self, anEntryKey, anEntryValue )
        end
    end

    -- remember misses so the same key does not trigger another load
    if not rawget( self, aKey ) then
        rawset( self, aKey, '' )
    end

    return rawget( self, aKey )
end
150
151 --------------------------------------------------------------------------------
152 -- Metamethods
153 --------------------------------------------------------------------------------
154
-- Calling the module: replace every sequence matched by the pattern below
-- in `aValue` with the entry found for it in the `data` table.
-- Returns the resulting string, or nothing when `aValue` is nil or false.
function meta:__call( aValue )
    if not aValue then
        return
    end

    -- UTF-8 sequences
    -- as per http://lua-users.org/wiki/LuaUnicode
    local aResult = aValue:gsub( '([%z\1-\127\194-\244][\128-\191]*)', data )

    return aResult
end
162
-- Concatenation: both operands are converted with tostring() and joined,
-- left operand first.
function meta:__concat( aValue )
    local aLeft = tostring( self )
    local aRight = tostring( aValue )

    return aLeft .. aRight
end
166
-- String form of the module: "<_NAME>/<_VERSION>".
function meta:__tostring()
    local aName = self._NAME
    local aVersion = self._VERSION

    return string.format( '%s/%s', aName, aVersion )
end
Note: See TracBrowser for help on using the browser.