/* ======================================================================
The Bodington System Software License, Version 1.0
  
Copyright (c) 2001 The University of Leeds.  All rights reserved.
  
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1.  Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2.  Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3.  The end-user documentation included with the redistribution, if any,
must include the following acknowledgement:  "This product includes
software developed by the University of Leeds
(http://www.bodington.org/)."  Alternately, this acknowledgement may
appear in the software itself, if and wherever such third-party
acknowledgements normally appear.

4.  The names "Bodington", "Nathan Bodington", "Bodington System",
"Bodington Open Source Project", and "The University of Leeds" must not be
used to endorse or promote products derived from this software without
prior written permission. For written permission, please contact
d.gardner@leeds.ac.uk.

5.  The name "Bodington" may not appear in the name of products derived
from this software without prior written permission of the University of
Leeds.

THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO,  TITLE,  THE IMPLIED WARRANTIES 
OF QUALITY  AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO 
EVENT SHALL THE UNIVERSITY OF LEEDS OR ITS CONTRIBUTORS BE LIABLE FOR 
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE.
=========================================================

This software was originally created by the University of Leeds and may contain voluntary 
contributions from others.  For more information on the Bodington Open Source Project, please 
see http://bodington.org/

====================================================================== */
package org.bodington.util;

import java.util.*;
import java.io.*;
import java.io.UnsupportedEncodingException;

/**
 * URL encoder offering conventional percent-encoding plus two
 * Bodington specific schemes ("bodington_underscore" and
 * "bodington_escape") that pre-escape characters before the
 * percent-encoding step.
 * <p>
 * Unlike <code>java.net.URLEncoder</code> a space is written as
 * <code>%20</code> rather than <code>+</code>.
 * <p>
 * Encoding keeps no state between calls: every call converts its own
 * input independently, so nothing from one string can appear in the
 * output of another.
 */
public class URLEncoder
{
    // this is the same as "special chars" as defined in RFC 1738
    // but without the plus symbol which can cause problems
    // and with the space, double quotes and square brackets
    // which are pretty safe to URL encode
    private static final String safe_chars = "$-_.!*'(), \"[]";
    
    // upper case digits used when writing %XX escapes
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();
    
    // characters that are copied to the output without a %XX escape:
    // a-z, A-Z, 0-9, '-', '_', '.' and '*'
    static BitSet dontNeedEncoding;
    static
    {
        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++)
        {
            dontNeedEncoding.set(i);
        }
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');
    }
    
    /**
     * Tests whether a character may appear unescaped in the intermediate
     * string built by the Bodington 2.1 encoding.
     * @param c The character to test.
     * @return true for a-z, A-Z, 0-9 and any character in safe_chars.
     */
    private static boolean isSafeChar( char c )
    {
        if ( c>='a' && c<='z' )
            return true;
        if ( c>='A' && c<='Z' )
            return true;
        if ( c>='0' && c<='9' )
            return true;
        return safe_chars.indexOf( c ) >= 0;
    }
    
    /**
     * Decides whether a name needs the underscore escaping of the
     * Bodington 2.1 encoding.
     * @param name The name to examine.
     * @return true if the name starts with an underscore or contains any
     *         character rejected by isSafeChar.
     */
    private static boolean isAwkwardName( String name )
    {
        // a leading underscore marks an escaped name, so a plain name that
        // starts with one has to be escaped too or the two could not be
        // told apart when decoding
        if ( name.length() > 0 && name.charAt( 0 ) == '_' )
            return true;
        
        for ( int i=0; i<name.length(); i++ )
        {
            if ( !isSafeChar( name.charAt( i ) ) )
                return true;
        }
        return false;
    }
    
    /**
     * Appends a character code as lower case hexadecimal, left padded
     * with zeros to at least four digits.
     * @param out The buffer to append to.
     * @param c The character code to write.
     */
    private static void appendHex4( StringBuffer out, int c )
    {
        String hex = Integer.toHexString( c );
        for ( int j=hex.length(); j<4; j++ )
            out.append( '0' );
        out.append( hex );
    }
    
    /**
     * Bodington 2.1 ("underscore") encoding.
     * <p>
     * A name that does not start with an underscore and contains only
     * characters accepted by isSafeChar is percent-encoded as US-ASCII
     * with no further change.  Any other name is rewritten first: the
     * result starts with an underscore, each underscore in the name
     * becomes a double underscore, safe characters are copied and every
     * other character becomes an underscore followed by its four digit
     * hexadecimal code.  The rewritten name is then percent-encoded as
     * US-ASCII.
     * @param name The string to encode.
     * @return An encoded version of the string.
     * @throws UnsupportedEncodingException If US-ASCII is not available.
     */
    private static String encodeToBodURL_2_1( String name )
    throws UnsupportedEncodingException
    {
        // url encoding can go wrong for unicode characters with
        // bit 7 set and in iPlanet for semi-colons and pluses.
        // Tomcat 4 disallows many ASCII characters in URLs
        // Another problem is that web browsers have no way to
        // know the character encoding that was used to do the
        // url encoding which they need to do if the user wants to
        // save the file to disk.  ASCII seems to be the usual
        // assumption although UTF-8 would be more useful.
        
        // it would be nice to just use urlencoding with UTF8
        // but tomcat 4 doesn't process urls with certain url
        // encoded bytes.  Browsers don't understand UTF8 either
        // so it would cause a problem for users saving downloaded
        // files. (Browser wouldn't know what file name to suggest)
        
        // UTF-7 might be another choice except that it uses + as
        // an escape character which is the very character we need
        // to avoid!
        
        // solution to URL encoding is to divide names into
        // those that even the most unfriendly web server can
        // cope with and those that might cause problems.
        // The easy names are URL encoded with ASCII char set
        // in the conventional way.  Normal web browsers will be
        // able to work out the original file name.  For others
        // the 'awkward' characters are coded as 4 digit hex
        // escaped with an underscore.
        // The awkward URLs are prefixed with an underscore so
        // Bodington can decode them even if the web browser can't.
        
        if ( !isAwkwardName( name ) )
            return encodeNormally( name, "US-ASCII" );
        
        StringBuffer escaped = new StringBuffer( name.length()*2 );
        // prefix that marks the name as escaped
        escaped.append( '_' );
        for ( int i=0; i<name.length(); i++ )
        {
            char c = name.charAt( i );
            
            // underscore is escaped as double underscore; this test must
            // come first because underscore also counts as a safe character
            if ( c == '_' )
                escaped.append( "__" );
            // safe characters other than underscore go straight through
            else if ( isSafeChar( c ) )
                escaped.append( c );
            else
            {
                // unsafe characters are escaped with an underscore
                // and are output as four character hex
                escaped.append( '_' );
                appendHex4( escaped, c );
            }
        }
        
        return encodeNormally( escaped.toString(), "US-ASCII" );
    }
    
    /**
     * Bodington 2.0 ("escape") encoding.
     * <p>
     * Any character outside the 7 bit set, ESC(0x1b), +(0x2b) or ;(0x3b)
     * is replaced by ESC followed by its four digit hexadecimal code.
     * The result is then percent-encoded as US-ASCII.
     * @param name The string to encode.
     * @return The encoded string.
     * @throws UnsupportedEncodingException If US-ASCII is not available.
     */
    private static String encodeToBodURL_2_0( String name )
    throws UnsupportedEncodingException
    {
        StringBuffer escaped = new StringBuffer( name.length() );
        
        for ( int i=0; i<name.length(); i++ )
        {
            char c = name.charAt( i );
            
            // url encoding can go wrong for unicode characters with
            // bit 7 set and in iPlanet for semi-colons and pluses.
            // Also the escape code needs to be escaped.
            if ( c >= 0x80 || c == 0x3b || c == 0x2b || c == 0x1b )
            {
                escaped.append( '\u001b' );
                appendHex4( escaped, c );
            }
            else
                escaped.append( c );
        }
        
        return encodeNormally( escaped.toString(), "US-ASCII" );
    }
    
    /**
     * Translates a string into percent-encoded form.
     * @param s <code>String</code> to be translated.
     * @param encoding How we translate the string:
     *               <ul>
     *               <li>bodington_underscore - use the Bodington 2.1 encoding.</li>
     *               <li>bodington_escape - use the Bodington 2.0 encoding.</li>
     *               </ul>
     *               The two names above are matched without regard to case.
     *               Any other value is taken as the name of the character
     *               encoding to use for normal encoding.
     * @return the translated <code>String</code>.
     * @throws UnsupportedEncodingException If the named character encoding
     *               is not supported.
     */
    public static String encode( String s, String encoding )
    throws UnsupportedEncodingException
    {
        if ( "bodington_underscore".equalsIgnoreCase( encoding ) )
            return encodeToBodURL_2_1( s );
        if ( "bodington_escape".equalsIgnoreCase( encoding ) )
            return encodeToBodURL_2_0( s );
        
        // Space is written as %20 rather than '+' by the normal encoding
        // because the plus can cause problems with some web servers
        // (eg iPlanet) that fail to convert it back to space when they
        // url decode.
        return encodeNormally( s, encoding );
    }
    
    
    /**
     * Percent-encodes a string using the named character encoding.
     * @param s The string to encode.
     * @param enc The name of the character encoding.
     * @return The encoded string.
     * @throws UnsupportedEncodingException If enc is not supported.
     */
    private static String encodeNormally( String s, String enc )
    throws UnsupportedEncodingException
    {
        return new URLEncoder( enc ).internalEncode( s );
    }
    
    /**
     * Checks that a character encoding name can be used.
     * @param enc The name to check.
     * @throws NullPointerException If enc is null.
     * @throws UnsupportedEncodingException If enc is not a legal name or
     *               names an encoding that is not available.
     */
    private static void checkEncoding( String enc )
    throws UnsupportedEncodingException
    {
        if ( enc == null )
            throw new NullPointerException( "encoding" );
        
        boolean supported;
        try
        {
            supported = java.nio.charset.Charset.isSupported( enc );
        }
        catch ( java.nio.charset.IllegalCharsetNameException e )
        {
            supported = false;
        }
        
        if ( !supported )
            throw new UnsupportedEncodingException( enc );
    }
    
    
    
    /**
     * Self check: encodes "abc" + ch + "def" as utf-8 for each char value
     * visited by the loop below and prints both results wherever this
     * class and <code>java.net.URLEncoder</code> disagree.
     * @param params Not used.
     */
    public static void main( String[] params )
    {
        try
        {
            String test, enc1, enc2;
            String encoding = "utf-8";
            
            for ( char ch = 0; ch < 0xffff; ch++ )
            {
                // jump over the range 0xd800 to 0xe7ff, which includes
                // the surrogate code units
                if ( ch == 0xd800 )
                    ch = 0xe800;
                
                test = "abc" + ch + "def";
                enc1 = encode( test, encoding );
                enc2 = java.net.URLEncoder.encode(  test, encoding );
                if ( !enc1.equals( enc2 ) )
                {
                    System.out.println();
                    System.out.println( enc1 );
                    System.out.println( enc2 );
                }
                
            }
            
        }
        catch ( Exception e )
        {
            e.printStackTrace();
        }
    }
    
    
    
    
    // name of the character encoding used to turn characters into bytes
    private String encoding;
    
    /**
     * Creates an encoder for the named character encoding.  The name is
     * not checked here; it is checked when the encoder is used.
     * @param encoding The name of the character encoding.
     */
    public URLEncoder( String encoding )
    {
        this.encoding = encoding;
    }
    
    /**
     * Percent-encodes a string.  Characters flagged in dontNeedEncoding
     * are copied unchanged.  Every other character is converted to bytes
     * in this encoder's character encoding and each byte is written as
     * %XX with upper case hexadecimal digits.
     * @param s The string to encode.
     * @return The encoded string, or s itself if nothing needed escaping.
     * @throws UnsupportedEncodingException If the character encoding is
     *               not supported.
     */
    private String internalEncode( String s )
    throws UnsupportedEncodingException
    {
        // fail on a bad encoding name even when s needs no escaping
        checkEncoding( encoding );
        
        int length = s.length();
        StringBuffer output = new StringBuffer( length + 16 );
        boolean needToChange = false;
        
        int i = 0;
        while ( i < length )
        {
            char c = s.charAt( i );
            if ( dontNeedEncoding.get( c ) )
            {
                output.append( c );
                i++;
                continue;
            }
            
            // Gather the whole run of characters that need escaping and
            // convert it in one go.  Both halves of a surrogate pair need
            // escaping so a pair always lands in the same run, and no
            // converter state is carried over to later runs or calls.
            int runStart = i;
            do
            {
                i++;
            }
            while ( i < length && !dontNeedEncoding.get( s.charAt( i ) ) );
            
            byte[] bytes = s.substring( runStart, i ).getBytes( encoding );
            for ( int j = 0; j < bytes.length; j++ )
            {
                output.append( '%' );
                output.append( HEX_DIGITS[ (bytes[j] >> 4) & 0xF ] );
                output.append( HEX_DIGITS[ bytes[j] & 0xF ] );
            }
            needToChange = true;
        }
        
        return (needToChange? output.toString() : s);
    }
    
    
    
}
