/* ======================================================================
   Parts Copyright 2006 University of Leeds, Oxford University, University of the Highlands and Islands.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

====================================================================== */

package org.bodington.xml;

import java.io.*;
import java.sql.*;
import java.text.*;
import java.util.*;

import org.apache.log4j.Logger;
import org.bodington.server.BuildingContext;

/**
 * An in-memory index of {@link Word} objects for a single locale.
 * <p>
 * Every word is indexed by its exact source string and, in addition, by its
 * collation key at primary, secondary and tertiary strength.  Only the first
 * word seen for a given collation key is stored in the corresponding table,
 * so each collation table maps a key to one representative word.
 * <p>
 * The repository has two phases: words from persistent store are fed in with
 * {@link #loadWord(Word)} and, once {@link #completeLoading()} has been
 * called, new words may be added with {@link #addWord(String)} and lookups
 * made with the <code>getMatchingWords</code> methods.
 *
 * @author  jrm
 */
public class WordRepository
{
    private static Logger log = Logger.getLogger(WordRepository.class);

    /** Upper limit on the number of words returned by a wildcard lookup. */
    private static final int MAX_MATCHING_WORDS = 5;
    
    Collator primary_collator, secondary_collator, tertiary_collator;

    // identical_table is keyed by source string; the other tables are keyed
    // by CollationKey.  reverse_table is declared but never assigned here.
    Hashtable primary_table, secondary_table, tertiary_table, identical_table, reverse_table;
    // Words created by addWord(), in creation order.
    Vector words_in_order;
    
    // All records must be loaded from the database before
    // we can search or add new records.  This boolean allows us
    // to check.
    boolean loaded=false;
    // ID that will be given to the next word created by addWord().
    int next_free_id=1;
    
    /**
     * Creates a new, empty instance of WordRepository.
     *
     * @param locale Locale used to obtain the three collators.
     */
    public WordRepository( Locale locale )
    {
        primary_collator = Collator.getInstance( locale );
        secondary_collator = Collator.getInstance( locale );
        tertiary_collator = Collator.getInstance( locale );
        
        primary_collator.setStrength( Collator.PRIMARY );
        secondary_collator.setStrength( Collator.SECONDARY );
        tertiary_collator.setStrength( Collator.TERTIARY );

        primary_table = new Hashtable();
        secondary_table = new Hashtable();
        tertiary_table = new Hashtable();
        identical_table = new Hashtable();
        words_in_order = new Vector();
    }
    
    /**
     * Indicates whether {@link #completeLoading()} has been called.
     *
     * @return true once loading has been completed.
     */
    public boolean isLoaded()
    {
        return loaded;
    }
    
    /**
     * Looks up a word by its exact source string.
     *
     * @param source The exact string to look for.
     * @return The matching word or null if there is none.
     */
    public Word getWord( String source )
    {
        return (Word)identical_table.get( source );
    }
    
    /**
     * Looks up the representative word stored for a collation key.
     *
     * @param key Collation key produced by the collator of the given strength.
     * @param strength One of Collator.PRIMARY, SECONDARY or TERTIARY.
     * @return The stored word or null if the key is unknown.
     * @throws IllegalArgumentException for any other strength.
     */
    private Word getSimilarWord( CollationKey key, int strength )
    {
        switch ( strength )
        {
            case Collator.TERTIARY:
                return (Word)tertiary_table.get( key );
            case Collator.SECONDARY:
                return (Word)secondary_table.get( key );
            case Collator.PRIMARY:
                return (Word)primary_table.get( key );
        }
        throw new IllegalArgumentException( "Unknown collation strength." );
    }

    /**
     * Makes a copy of an existing word that will represent a new source
     * string.  The copy gets the new source and the next free 'identical' ID;
     * all other IDs and collation keys are inherited from the template.
     *
     * @param template Existing word that matches the source at some strength.
     * @param source The new source string.
     * @return The copy, not yet registered in any table.
     * @throws IllegalStateException if the template cannot be cloned.
     */
    private Word cloneAsNewWord( Word template, String source )
    {
        Word copy;
        try
        {
            copy = (Word)template.clone();
        }
        catch ( CloneNotSupportedException e )
        {
            // Previously this was swallowed which led to a NullPointerException
            // on the next statement; report the real cause instead.
            IllegalStateException failure =
                new IllegalStateException( "Unable to clone an existing word." );
            failure.initCause( e );
            throw failure;
        }
        copy.setSource( source );
        copy.setID( next_free_id, Collator.IDENTICAL );
        return copy;
    }

    /**
     * Records a newly created word under its source string, appends it to
     * the ordered list and consumes the ID it was given.
     *
     * @param source The source string of the word.
     * @param word The new word.
     */
    private void registerNewWord( String source, Word word )
    {
        identical_table.put( source, word );
        words_in_order.addElement( word );
        next_free_id++;
    }

    /** Puts a potentially new word in.  Returns the new word object or null
     *  if there was no need to add it because it is already in there. The
     *  calling code should store the returned word in persistent store. If
     *  the return value is null then it is already stored.
     *
     *  @param source The string to add.
     *  @return The newly created word, or null if the exact string is
     *          already present.
     *  @throws IllegalStateException if loading has not been completed or
     *          an existing word could not be cloned.
     */
    public Word addWord( String source )
    {
        if ( !loaded )
            throw new IllegalStateException( "The word store hasn't finished loading." );
        
        // The identical strength table uses the string, not the collation
        // key.  A hit means the string is already in the repository.
        if ( identical_table.get( source ) != null )
            return null;
        
        // This is a new word according to strongest comparison so needs its
        // own entry but it may match existing words with weaker comparison.
        Word wordi;
        
        CollationKey key3 = tertiary_collator.getCollationKey( source );
        Word word3 = getSimilarWord( key3, Collator.TERTIARY );
        if ( word3 != null )
        {
            // Tertiary equality implies secondary and primary equality, so
            // only the 'identical' ID differs from the existing word and
            // only the identical table needs a new entry.
            wordi = cloneAsNewWord( word3, source );
            registerNewWord( source, wordi );
            return wordi;
        }
        
        // No tertiary match so try a secondary match...
        CollationKey key2 = secondary_collator.getCollationKey( source );
        Word word2 = getSimilarWord( key2, Collator.SECONDARY );
        if ( word2 != null )
        {
            // Secondary equality implies primary equality.  The word is the
            // first with its tertiary key so its tertiary ID refers to itself.
            wordi = cloneAsNewWord( word2, source );
            wordi.setID( next_free_id, Collator.TERTIARY );
            wordi.setCollationKey( key3, Collator.TERTIARY );
            
            tertiary_table.put( key3, wordi );
            registerNewWord( source, wordi );
            return wordi;
        }

        // No secondary match so try a primary match...
        CollationKey key1 = primary_collator.getCollationKey( source );
        Word word1 = getSimilarWord( key1, Collator.PRIMARY );
        if ( word1 != null )
        {
            // Only the primary ID is shared with the existing word; the
            // tertiary and secondary IDs and keys belong to the new word.
            wordi = cloneAsNewWord( word1, source );
            wordi.setID( next_free_id, Collator.TERTIARY );
            wordi.setCollationKey( key3, Collator.TERTIARY );
            wordi.setID( next_free_id, Collator.SECONDARY );
            wordi.setCollationKey( key2, Collator.SECONDARY );
            
            tertiary_table.put( key3, wordi );
            secondary_table.put( key2, wordi );
            registerNewWord( source, wordi );
            return wordi;
        }
        
        // This new word doesn't match anything else at any level so create a
        // new entry from scratch.  All the IDs are self referential because
        // this is the first example of a word in the possible set of words
        // that have equality at any given strength of comparison.
        wordi = new Word( source, next_free_id, next_free_id, next_free_id,  next_free_id, key1, key2, key3 );
        tertiary_table.put( key3, wordi );
        secondary_table.put( key2, wordi );
        primary_table.put( key1, wordi );
        registerNewWord( source, wordi );
        return wordi;
    }
    
    /**
     * Gets the IDs of Words that match the string at a particular strength. 
     * (IDs returned are token IDs in database.) 
     * @param string String to match tokens to
     * @param strength Strength of matching
     * @return int array of token IDs, empty if nothing matches
     * @throws IllegalStateException if loading has not been completed
     * @throws IllegalArgumentException if the strength is not recognised
     */
    public int[] getMatchingWordIds( String string, int strength )
    {
        Vector words = getMatchingWords( string, strength );
        int[] ids = new int[words.size()];
        
        for ( int i=0; i<ids.length; i++ )
        {
            ids[i] = ((Word)words.get( i )).getID( strength );
        }
        
        return ids;
    }
    
    /**
     * Gets the Words that match the string at a particular strength.
     * @param string String to match tokens to
     * @param strength Strength of matching
     * @return Vector of matching Words, empty if nothing matches
     * @throws IllegalStateException if loading has not been completed
     * @throws IllegalArgumentException if the strength is not recognised
     */
    public Vector getMatchingWords (String string, int strength )
    {
        if ( !loaded )
            throw new IllegalStateException( "The word store hasn't finished loading." );
        
        Vector words;
        
        switch ( strength )
        {
            case Collator.IDENTICAL:
                // Exact lookup only; no wildcard search at this strength.
                Word word = (Word)identical_table.get( string );
                words = new Vector();
                if ( word != null )
                    words.add( word );
                break;
            case Collator.PRIMARY:
                words = getMatchingWords( primary_table, primary_collator, string, true );
                break;
            case Collator.SECONDARY:
                words = getMatchingWords( secondary_table, secondary_collator, string, true );
                break;
            case Collator.TERTIARY:
                words = getMatchingWords( tertiary_table, tertiary_collator, string, false );
                break;
            default:
               throw new IllegalArgumentException( "Unknown collation strength." );
        }
        
        return words;
    }
    
    /**
     * Gets Words from the Word repository for tokens that match the given string.
     * Tokens are obtained from the XML repository's wildcard lookup. These are 
     * then used to retrieve Words from the repository, using CollationKeys.<br />
     * Tokens that have no entry in the given table are ignored, so the result
     * never contains null.
     * 
     * @param wordTable Identifies which table of Words to use for collation
     * @param collator Identifies which Collator to use for generation of CollationKeys
     * @param string The term to find matches for
     * @param ignoreCase Passed through to the wildcard token lookup
     * @return Vector of Words
     */
    private Vector getMatchingWords( Hashtable wordTable, Collator collator, String string, boolean ignoreCase )
    {
        Vector words = new Vector();
        
        // Still need to check whether there's a match via CollationKey,
        // which is the case when the wildcard query doesn't retrieve a token
        // that differs in accent.
        Word word = (Word)wordTable.get( collator.getCollationKey( string ) );
        if ( word != null )
            words.add( word );
        
        String[] tokens = BuildingContext.getContext().getXMLRepository().getMatchingTokensUsingWildcard( string, ignoreCase );
        
        // The limit is checked before each token is examined, as before.
        for ( int i = 0; i < tokens.length && words.size() < MAX_MATCHING_WORDS; i++ )
        {
            word = (Word)wordTable.get( collator.getCollationKey( tokens[i] ) );
            // A token without an entry in this table yields null; adding it
            // would make callers such as getMatchingWordIds() fail.
            if ( word != null && !words.contains( word ) )
                words.add( word );
        } 
        
        return words;
    }
    
    /**
     * Prints one line describing a word: its identical, tertiary, secondary
     * and primary IDs followed by its source, all separated by tabs.  Line
     * feed, carriage return and tab characters in the source are written as
     * the two character sequences \n, \r and \t.
     *
     * @param writer Stream to print to.
     * @param word The word to print; nothing is printed if this is null.
     */
    public void dumpWord( PrintStream writer, Word word )
    {
        if ( word == null )
            return;

        writer.print( word.getID( Collator.IDENTICAL ) );
        writer.print( "\t" );
        writer.print( word.getID( Collator.TERTIARY ) );
        writer.print( "\t" );
        writer.print( word.getID( Collator.SECONDARY ) );
        writer.print( "\t" );
        writer.print( word.getID( Collator.PRIMARY ) );
        writer.print( "\t" );
        
        String source = word.getSource();
        for ( int j=0; j<source.length(); j++ )
        {
            char c = source.charAt( j );
            switch ( c )
            {
                case '\n':
                    writer.print( "\\n" );
                    break;
                case '\r':
                    writer.print( "\\r" );
                    break;
                case '\t':
                    writer.print( "\\t" );
                    break;
                default:
                    writer.print( c );
            }
        }
        writer.println();
    }
    
    /** Stores a word that has been loaded from persistent store.
     *  To make things work words MUST be loaded in the same order that
     *  they were created.  Missing collation keys are generated from the
     *  word's source.  An entry already present in a table is never
     *  replaced, so the first word loaded for a key stays its representative.
     *  Loaded words are not appended to the ordered list used by
     *  {@link #getFirstWord()} and {@link #getNextWord(Word)}.
     *
     *  @param word The word to store.
     *  @return The same word.
     *  @throws IllegalStateException if loading has already been completed.
     */
    public Word loadWord( Word word )
    {
        if ( loaded )
            throw new IllegalStateException( "The word store has finished loading." );
        
        // build collation keys if not already done
        String source = word.getSource();
        if ( word.getCollationKey( Collator.PRIMARY ) == null )
            word.setCollationKey( primary_collator.getCollationKey( source ), Collator.PRIMARY );
        if ( word.getCollationKey( Collator.SECONDARY ) == null )
            word.setCollationKey( secondary_collator.getCollationKey( source ), Collator.SECONDARY );
        if ( word.getCollationKey( Collator.TERTIARY ) == null )
            word.setCollationKey( tertiary_collator.getCollationKey( source ), Collator.TERTIARY );
        
        CollationKey key3 = word.getCollationKey( Collator.TERTIARY );
        CollationKey key2 = word.getCollationKey( Collator.SECONDARY );
        CollationKey key1 = word.getCollationKey( Collator.PRIMARY );
        
        if ( !identical_table.containsKey( source ) )
            identical_table.put( source, word );
        if ( !tertiary_table.containsKey( key3 ) )
            tertiary_table.put( key3, word );
        if ( !secondary_table.containsKey( key2 ) )
            secondary_table.put( key2, word );
        if ( !primary_table.containsKey( key1 ) )
            primary_table.put( key1, word );
        
        // The next free ID only has to be greater than every loaded ID; it
        // must not be advanced when the loaded ID is already below it.
        next_free_id = Math.max( word.getID( Collator.IDENTICAL )+1, next_free_id );
        return word;
    }

    /**
     * Gets the first word that was created by {@link #addWord(String)}.
     *
     * @return The first word or null if no word has been added yet.
     */
    public Word getFirstWord()
    {
        if ( words_in_order.isEmpty() )
            return null;
        return (Word)words_in_order.elementAt( 0 );
    }
    
    /**
     * Gets the word that was added directly after the given one.
     *
     * @param current A word previously returned by this repository.
     * @return The following word or null if current is the last one.
     * @throws IllegalArgumentException if current is not in the ordered list.
     */
    public Word getNextWord( Word current )
    {
        int n = words_in_order.indexOf( current );
        if ( n<0 )
            throw new IllegalArgumentException( "Can't get next word - specified word doesn't exist." );
        
        int following = n+1;
        if ( following >= words_in_order.size() )
            return null;
        return (Word)words_in_order.elementAt( following );
    }
    
    /**
     * Marks the end of the loading phase.  After this call
     * {@link #loadWord(Word)} is rejected and adding and searching are allowed.
     */
    public void completeLoading()
    {
        loaded = true;
    }
    
    /**
     * Small demonstration: adds four variants of the same word to an empty
     * English repository and dumps each resulting word to standard output.
     *
     * @param args Ignored.
     */
    public static void main( String[] args )
    {
        String[] list = { "th\u00edngy", "thi\u0301ngy", "Th\u00edngy", "thingy" };
        WordRepository rep = new WordRepository( Locale.ENGLISH );
        rep.completeLoading();
        for ( int i=0; i< list.length; i++ )
        {
            Word word = rep.addWord( list[i] );
            rep.dumpWord( System.out, word );
        }
    }
}
