/* ======================================================================
   Parts Copyright 2006 University of Leeds, Oxford University, University of the Highlands and Islands.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

====================================================================== */

package org.bodington.util;

import java.util.*;
import java.io.*;
import java.io.UnsupportedEncodingException;

/**
 * URL encoder offering the conventional percent-encoding for an arbitrary
 * character encoding plus two Bodington specific schemes
 * (<code>bodington_underscore</code> and <code>bodington_escape</code>) that
 * first rewrite awkward characters into plain ASCII.
 * <p>
 * All work is done through the static {@link #encode(String, String)} method.
 * Internally, encoder instances are kept in per-encoding pools; every access
 * to the pools is synchronized on the pool table and an instance is only used
 * by one caller between being taken from and returned to its pool.
 */
public class URLEncoder
{
    // this is the same as "special chars" as defined in RFC 1738
    // but without the plus symbol which can cause problems
    // and with the space, double quotes and square brackets
    // which are pretty safe to URL encode
    private static final String safe_chars = "$-_.!*'(), \"[]";
    
    // upper case digits used when writing %XX escapes
    private static final String HEX_DIGITS = "0123456789ABCDEF";
    
    // Characters that the normal encoding copies to the output unchanged:
    // a-z, A-Z, 0-9 and - _ . *
    // Space is deliberately absent so it is written as %20 and never as a
    // plus; the plus can cause problems with some web servers (eg iPlanet)
    // that fail to convert it back to space when they url decode.
    static BitSet dontNeedEncoding;
    static
    {
        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++)
        {
            dontNeedEncoding.set(i);
        }
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');
    }
    
    // table of pools of internal encoders, keyed by encoding name;
    // also used as the lock that guards the pools
    private static final Hashtable pool_table = new Hashtable();
    
    /**
     * Tests whether a character may appear unescaped in a Bodington 2.1 name.
     * @param c The character to test.
     * @return true for a-z, A-Z, 0-9 and the characters in safe_chars.
     */
    private static boolean isSafeChar( char c )
    {
        if ( c>='a' && c<='z' )
            return true;
        if ( c>='A' && c<='Z' )
            return true;
        if ( c>='0' && c<='9' )
            return true;
        return safe_chars.indexOf( c ) >= 0;
    }
    
    /**
     * Appends a character code as exactly four lower case hexadecimal digits.
     * @param out The buffer to append to.
     * @param c The character to write.
     */
    private static void appendHex4( StringBuffer out, char c )
    {
        String hex = Integer.toHexString( c );
        for ( int j=hex.length(); j<4; j++ )
            out.append( '0' );
        out.append( hex );
    }
    
    /**
     * Appends one byte as a percent sign followed by two upper case
     * hexadecimal digits.
     * @param out The buffer to append to.
     * @param b The byte to write; only the low eight bits are used.
     */
    private static void appendPercentEscape( StringBuffer out, int b )
    {
        out.append( '%' );
        out.append( HEX_DIGITS.charAt( (b >> 4) & 0xF ) );
        out.append( HEX_DIGITS.charAt( b & 0xF ) );
    }
    
    /**
     * Decides whether a name has to be written in the escaped form of the
     * Bodington 2.1 encoding.
     * @param name The name to examine.
     * @return true if the name starts with an underscore or contains any
     *         character that is not a safe character.
     */
    private static boolean isDodgy( String name )
    {
        // if the name starts with an underscore we have to
        // count it as dodgy so we can tell dodgy URLs from normal
        // ones when we DEcode
        if ( name.length() > 0 && name.charAt( 0 ) == '_' )
            return true;
        
        for ( int i=0; i<name.length(); i++ )
        {
            if ( !isSafeChar( name.charAt( i ) ) )
                return true;
        }
        return false;
    }
    
    /**
     * Bodington 2.1 encoding. Names made only of safe characters that do not
     * start with an underscore are URL encoded as US-ASCII in the normal way.
     * Any other name is prefixed with an underscore, each underscore in it is
     * doubled and each unsafe character is replaced by an underscore followed
     * by its four digit hexadecimal code; the result is then URL encoded.
     * @param name The string to encode.
     * @return An encoded version of the string.
     * @throws UnsupportedEncodingException If US-ASCII is not available.
     */
    private static String encodeToBodURL_2_1( String name )
    throws UnsupportedEncodingException
    {
        // url encoding can go wrong for unicode characters with
        // bit 7 set and in iPlanet for semi-colons and pluses.
        // Tomcat 4 disallows many ASCII characters in URLs
        // Another problem is that web browsers have no way to
        // know the character encoding that was used to do the
        // url encoding which they need to do if the user wants to
        // save the file to disk.  ASCII seems to be the usual
        // assumption although UTF-8 would be more useful.
        
        // it would be nice to just use urlencoding with UTF8
        // but tomcat 4 doesn't process urls with certain url
        // encoded bytes.  Browsers don't understand UTF8 either
        // so it would cause a problem for users saving downloaded
        // files. (Browser wouldn't know what file name to suggest)
        
        // UTF-7 might be another choice except that it uses + as
        // an escape character which is the very character we need
        // to avoid!
        
        // solution to URL encoding is to divide names into
        // those that even the most unfriendly web server can
        // cope with and those that might cause problems.
        // The easy names are URL encoded with ASCII char set
        // in the conventional way.  Normal web browsers will be
        // able to work out the original file name.  For others
        // the 'awkward' characters are coded as 4 digit hex
        // escaped with an underscore.
        // The awkward URLs are prefixed with an underscore so
        // Bodington can decode them even if the web browser can't.
        
        if ( !isDodgy( name ) )
            return encodeNormally( name, "US-ASCII" );
        
        StringBuffer escaped = new StringBuffer( name.length()*2 + 1 );
        // leading underscore marks the name as escaped
        escaped.append( '_' );
        for ( int i=0; i<name.length(); i++ )
        {
            char c = name.charAt( i );
            
            if ( c == '_' )
            {
                // underscore is escaped as double underscore
                escaped.append( "__" );
            }
            else if ( isSafeChar( c ) )
            {
                // safe characters other than underscore go straight through
                escaped.append( c );
            }
            else
            {
                // unsafe characters are output as an underscore
                // followed by four character hex
                escaped.append( '_' );
                appendHex4( escaped, c );
            }
        }
        
        return encodeNormally( escaped.toString(), "US-ASCII" );
    }
    
    /**
     * Bodington 2.0 encoding.
     * Any character that is outside the 7bit set, or is ESC(0x1b), +(0x2b)
     * or ;(0x3b), is replaced by ESC followed by its four digit hexadecimal
     * code. The result is then URL encoded as US-ASCII.
     * @param name The string to encode.
     * @return The encoded string.
     * @throws UnsupportedEncodingException If US-ASCII is not available.
     */
    private static String encodeToBodURL_2_0( String name )
    throws UnsupportedEncodingException
    {
        StringBuffer escaped = new StringBuffer( name.length() );
        
        for ( int i=0; i<name.length(); i++ )
        {
            char c = name.charAt( i );
            
            // url encoding can go wrong for unicode characters with
            // bit 7 set and in iPlanet for semi-colons and pluses.
            // Also the escape code needs to be escaped.
            if ( c >= 0x80 || c == 0x3b || c == 0x2b || c == 0x1b )
            {
                escaped.append( '\u001b' );
                appendHex4( escaped, c );
            }
            else
                escaped.append( c );
        }
        
        return encodeNormally( escaped.toString(), "US-ASCII" );
    }
    
    /**
     * Translates a string into <code>x-www-form-urlencoded</code> format,
     * except that a space is written as <code>%20</code> and not as a plus.
     * @param s <code>String</code> to be translated.
     * @param encoding How we translate the string:
     *               <ul>
     *               <li>bodington_underscore - use the Bodington 2.1 encoding.</li>
     *               <li>bodington_escape - use the Bodington 2.0 encoding.</li>
     *               </ul>
     *               Any other value is taken as the name of the character
     *               encoding to use with the normal encoding.
     * @return the translated <code>String</code>.
     * @throws UnsupportedEncodingException If the named character encoding
     *               is not supported.
     */
    public static String encode( String s, String encoding )
    throws UnsupportedEncodingException
    {
        if ( "bodington_underscore".equalsIgnoreCase( encoding ) )
            return encodeToBodURL_2_1( s );
        if ( "bodington_escape".equalsIgnoreCase( encoding ) )
            return encodeToBodURL_2_0( s );
        return encodeNormally( s, encoding );
    }
    
    
    /**
     * Percent-encodes a string using a pooled encoder for the given
     * character encoding.
     * @param s The string to encode.
     * @param enc The name of the character encoding.
     * @return The encoded string.
     * @throws UnsupportedEncodingException If the encoding is not supported.
     */
    private static String encodeNormally( String s, String enc )
    throws UnsupportedEncodingException
    {
        URLEncoder ienc = getEncoder( enc );
        String out = ienc.internalEncode( s );
        // only reached on success; an encoder that failed is dropped
        recycleEncoder( ienc );
        return out;
    }
    
    /**
     * Takes an encoder for the given encoding from its pool, or creates a
     * new one if none is waiting.
     * @param enc The name of the character encoding.
     * @return An encoder that the caller owns until it is recycled.
     */
    private static URLEncoder getEncoder( String enc )
    {
        synchronized ( pool_table )
        {
            // the pool itself is only created when an encoder is recycled
            // so unsupported encoding names never get an entry in the table
            Vector pool = (Vector)pool_table.get( enc );
            if ( pool != null && !pool.isEmpty() )
                return (URLEncoder)pool.remove( pool.size()-1 );
        }
        return new URLEncoder( enc );
    }
    
    /**
     * Returns an encoder to the pool for its encoding so it can be reused.
     * @param ienc The encoder to put back.
     */
    private static void recycleEncoder( URLEncoder ienc )
    {
        synchronized ( pool_table )
        {
            Vector pool = (Vector)pool_table.get( ienc.getEncoding() );
            if ( pool == null )
            {
                pool = new Vector();
                pool_table.put( ienc.getEncoding(), pool );
            }
            pool.addElement( ienc );
        }
    }
    
    
    
    /**
     * Test harness. Encodes "abc" + ch + "def" as utf-8 for every character
     * below 0xffff, skipping 0xd800 to 0xe7ff, and prints both results
     * wherever this class and java.net.URLEncoder disagree.
     * @param params Ignored.
     */
    public static void main( String[] params )
    {
        try
        {
            String test, enc1, enc2;
            String encoding = "utf-8";
            
            for ( char ch = 0; ch < 0xffff; ch++ )
            {
                if ( ch == 0xd800 )
                    ch = 0xe800;
                
                test = "abc" + ch + "def";
                enc1 = encode( test, encoding );
                enc2 = java.net.URLEncoder.encode(  test, encoding );
                if ( !enc1.equals( enc2 ) )
                {
                    System.out.println();
                    System.out.println( enc1 );
                    System.out.println( enc2 );
                }
                
            }
            
        }
        catch ( Exception e )
        {
            e.printStackTrace();
        }
    }
    
    
    
    
    // name of the character encoding this encoder converts to
    private String encoding;
    // reusable output buffer, created on first use
    private StringBuffer output=null;
    
    /**
     * Creates an encoder for one character encoding. The encoding name is
     * not checked here; it is checked the first time the encoder is used.
     * @param encoding The name of the character encoding.
     */
    public URLEncoder( String encoding )
    {
        this.encoding = encoding;
    }
    
    /**
     * Prepares the encoder for a new string. On first use this also checks
     * that the character encoding is supported so that an unsupported
     * encoding is reported even if the string needs no escaping.
     * @throws UnsupportedEncodingException If the encoding is not supported.
     */
    private void reset()
    throws UnsupportedEncodingException
    {
        if ( output == null )
        {
            // converting an empty string is enough to make an unsupported
            // encoding name throw
            "".getBytes( encoding );
            output = new StringBuffer( 10 );
        }
        else
        {
            output.setLength( 0 );
        }
    }
    
    private String getEncoding()
    {
        return encoding;
    }
    
    
    /**
     * Percent-encodes a string. Characters in dontNeedEncoding are copied
     * through; every run of other characters is converted to bytes in this
     * encoder's character encoding and each byte is written as %XX.
     * <p>
     * Each run is converted with String.getBytes so that no conversion state
     * is carried from one run, or one call, to the next. Surrogate pairs lie
     * inside a single run and are converted together; an unpaired surrogate
     * is converted to whatever replacement the character encoding uses.
     * @param s The string to encode.
     * @return The encoded string, or s itself if nothing needed escaping.
     * @throws UnsupportedEncodingException If the encoding is not supported.
     */
    private String internalEncode( String s )
    throws UnsupportedEncodingException
    {
        reset();
        
        boolean needToChange = false;
        int length = s.length();
        int i = 0;
        while ( i < length )
        {
            char c = s.charAt( i );
            if ( dontNeedEncoding.get( c ) )
            {
                output.append( c );
                i++;
                continue;
            }
            
            // find the end of this run of characters that need escaping
            int runStart = i;
            do
            {
                i++;
            }
            while ( i < length && !dontNeedEncoding.get( s.charAt( i ) ) );
            
            // convert to external encoding before hex conversion
            byte[] bytes = s.substring( runStart, i ).getBytes( encoding );
            for ( int j = 0; j < bytes.length; j++ )
                appendPercentEscape( output, bytes[j] );
            needToChange = true;
        }
        
        return (needToChange? output.toString() : s);
    }
    
    
    
}
