/*                                                                            *
 * Index2.c 1.0 2000 12 01                                                    *
 *                                                                            *
 * Copyright (c) 2000 Erwan Lemonnier and Eric Nordenstam                     *
 *                                                                            *
 * The code presented in this file has been tested with care but              *
 * is not guaranteed for any purpose. The writer does not offer               *
 * any warranties nor does he accept any liabilities with respect             *
 * to the code.                                                               */


/* The main purpose of this program is to sort a suffix array. Using this     *
 * suffix array it can also give the number of occurences of a character      *
 * string in the text. It will also give you the length of the longest        *
 * repeated substring.                                                        *
 *                                                                            *
 * The sorting is done by one of three algorithms  depending on the command   *
 * line parameters. The simplest choice is quicksort. This is slow. Next there*
 * is a recursive counting sort, which is considderably faster. Then there is *
 * a looping counting sort, which is a slight improvement over its recursive  *
 * ancestor. The by far most  efficient algorithm is the Manber-Myers         *
 * algorithm with an expected runtime of O(n log n).                          */

#include <stdio.h>
#include <malloc.h>
#include <time.h>

#define MAX(a,b) (a>b)?a:b




//function declarations
int   largestinf(int*,int,int);
void  qcksortint(int*,int,int);
int   firstsame(char*,int*,int,int,int);
int   longest(char*,int*,int,int,int);
int   boundedstrcmp(char*,char*,int);
void  countstr(int*,char*,int*,int,char*,int);
int   findstr(char*,int*,int,char*);
int   largestinf(int*,int,int);
void  qcksort2arrays(int*,int*,int,int);
int   countmetastr(int*,int,char*,char*);

//used for debuging: show the values in the suffix array of size l
void show(int* array, int l) {
  int i;
  printf("--start--\n");
  for(i=0; i<l;i++)
    printf("%d\n",array[i]);
  printf("\n");
}


//  largestinf

/*  DESCRIPTION:               (used by 'countmetastr()')                   *
 *    returns the index of the largest value in array which is <= to v      *
 *    l is the length of the array. array is sorted in increasing order     *
 *    returns -1 if no value is smaller than v                              */

int largestinf(int* array, int l, int v) {
  int a,b,c;

  if (array[0]>v)
    return -1;
  
  //initializing binomial search
  a = 0;
  c = l-1;
  b = (a+c)/2;

  //searching
  while (c-a>1) {
    if (array[b]>v)
      c = b;
    else
      a = b;
    
    b = (a+c)/2;
  }
  //it took time O(log n)

  if (array[c]<=v)
    return c;
  else
    return a;
}


//  countmetastr 

/*  DESCRIPTION:                                                                 *
 *    Count the number of substring in the text that match a substring           *
 *    containing '*' metacharacters. The strategy adopted is to use dynamic      *
 *    programming and build the result for the first substring in mstring,       *
 *    then to add the second substring and update the counts for it,             *
 *    and so one, while taking care of the special                               *
 *    cases of mstring begining and ending with '*'                              */

/*  PARAMETERS                                                                   *
 *    index:        the sorted index on 'file'                                   *
 *    file_length:  number of characters in file                                 *
 *    mstring:      the string containing '*' characters                         *
 *    file:         the file we are analysing                                    */

int countmetastr(int* arSuffix, int file_length, char* mstring, char* file) {
  int left;                    //number of characters still to be matched in 'mstring'
  int START_WITH_STAR = 0;     //=1 if mstring begins with '*'. else =0.
  int first;                   //=1 if the matching string is the first substring of mstring

  int* subindex1;     //index containing the sorted positions of the first suffix
                      //after the suffixes matching the previous matching substring 
                      //of mstring
  int* subindex2;     //same as subindex1 for the current substring

  int* array1;     //contains the number of suffixes matching mstring up to the previous
                   //substring and whose last string are the corresponding strings indexed
                   //in subindex1   
  int* array2;     //same as array2 for the current substring

  int pos[2];      //contains the result of 'countstr()'
  int la1;         //length of array1 (rem: length array1 = length subindex1)
  
  int i,j,ii,cnt,p,l,k,s;    //various variables
    
  //replace '*' by '\0' in mstring, set 'left' to length of mstring
  for(left = 0; mstring[left] != '\0'; left++) {
    if (mstring[left]=='*')
      mstring[left]='\0';
  }
  
  if (mstring[0]=='\0')      //if 'mstring' begins with '*'
    START_WITH_STAR = 1;

  first = 1;
  do {

    //skip the stars ('\0') until a substring to search for is found 
    //in 'mstring' or the end of 'mstring' is reached
    while (left && mstring[0]=='\0') {
      mstring++;
      left--;
    }

    //mstring points to the next substring to match
    //left is the number of characters left in the original 'mstring'
    if (left) {
      
      if (first) {
	
	/* COUNT OCCURENCES FOR THE FIRST SUBSTRING IN MSTRING	 */

	first = 0;

	//find an occurence of the substring mstring of the roiginal 'mstring' in arSuffix
	p =  findstr(file, arSuffix, file_length, mstring);

	//no occurence found
	if (p==-1)
	  return 0;

	//search in index the position pos[0] and pos[1] of the first 
	//and last string begining with mstring
	countstr(pos, file, arSuffix, file_length, mstring, p);
	i = pos[1]-pos[0]+1;

	//allocate subindex1 and array1
	subindex1 = (int*)malloc(i*sizeof(int));
	array1 = (int*)malloc(i*sizeof(int));
	
	if (array1==NULL || subindex1==NULL) {
	  printf("Error ! not enough memory to proceed search\n");
	  exit(0);
	}
	la1 = i;       //length array 1 = i

	//stores the position
	//move the suffix positions that are between pos[0] and pos[1] in arSuffix 
	//to subindex1 and add the length of mstring to them
	l = strlen(mstring);
	for(j=0; j<i; j++)
	  subindex1[j] = l+arSuffix[pos[0]+j];

	//initiate array1 with the number of string matching each mstring
	if (START_WITH_STAR) {
	  //if mstring starts with '*', all suffixes starting before subindex[j]-l are possible
	  for(j=0; j<la1; j++)
	    array1[j]=subindex1[j]+1-l;
	} else {
	  //only the occurence found should be counted
	  for(j=0; j<la1; j++)
	    array1[j]=1;
	}
	
	//sort subindex1 in increasing order, and reorganize array1 accordingly
	qcksort2arrays(subindex1, array1, 0, la1-1);

      } else {
	
	/*  COUNT OCCURENCES OF FURTHER SUBSTRINGS IN MSTRING    */

	//find the position of an occurence of the substring
	p = findstr(file, arSuffix, file_length, mstring);
	if (p==-1)
	  return 0;
	
	//finding further right and left occurences of this substring
	countstr(pos, file, arSuffix, file_length, mstring, p);
	i = pos[1]-pos[0]+1;
	subindex2 = (int*)malloc(i*sizeof(int));
	array2 = (int*)malloc(i*sizeof(int));
	l = strlen(mstring);
	
	//fill subindex2 with the pointers between pos[0] and pos[1]
	//that are after at least one already matched sequence of strings
	cnt = 0;
	for(j=0; j<i; j++) {
	  p = arSuffix[pos[0]+j];
	  
	  //search the position of the largest element of subindex1 <= to p
	  //ie the first previously found suffix that does not match with the surrent substring
	  //we got -1 if there is none
	  k = largestinf(subindex1,la1,p);

	  if (k!=-1) {
	    subindex2[cnt] = p+l;

	    //sum the numbers of matched found for all the possible suffixes matching up to
	    //the previous substring and still matching the current substring
	    s = 0;
	    for(ii=0; ii<=k; ii++)
	      s += array1[ii];

	    array2[cnt] = s;
	    cnt++;
	  } else
	    array2[cnt] = 0;
	}

	//sort subindex2 and array2 accordingly
	qcksort2arrays(subindex2, array2, 0, cnt-1);

	//free unused arrays
	free(subindex1);
	free(array1);
	subindex1 = subindex2;
	array1 = array2;
	la1 = cnt;
      }
      
      //move to the next substring in 'mstring'
      mstring += l;
      left -= l;
    }  
  } while (left>0);

  //test wether mstring ends with a '*'
  if ( *(mstring-1)=='\0') {
    
    //if there were only '*' in mstring
    //return n*(n+1)/2, where n is the length of the text
    if (first)
      return file_length*(file_length+1)/2;
  
    //otherwise, compute all the possible ends to the matching suffixes found
    for (i=0; i<la1; i++) {
      p = file_length-subindex1[i]+1;
    
      if (p)
	array1[i] = array1[i]*p;
    }
  }

  //now, sum the results in array1
  s = 0;
  for(i=0; i<la1; i++)
    s += array1[i];

  //free the last used arrays
  free(array1);
  free(subindex1);

  //and we have it ! :)
  return s;
}




//  qcksort2arrays

/*  DESCRIPTION:               (used by 'countmetastr()')                   *
 *    quicksort the array of integers 'array', and move the elements of     *
 *    'array2' accordingly. 'p' and 'r' are the indexes between which       *
 *    to sort the elements of array.                                        */

void qcksort2arrays(int* array, int* array2, int p, int r) {
  int x, y;
  int i, j;
  
  if (p<r) {
    x = array[p];
    i = p;
    j = r;

    while (i<j) {

      while ( i<j && array[i]<x)
	i++;

      while ( i<j && array[j]>=x)
	j--;

      if (i<j) {
	y = array[i];
	array[i] = array[j];
	array[j] = y;

	y = array2[i];
	array2[i] = array2[j];
	array2[j] = y;
      }
    }

    if (p<i) {
      qcksort2arrays(array, array2, p, i-1);
      qcksort2arrays(array, array2, i, r);
    } else         //array[p] was the smallest element
      qcksort2arrays(array, array2, p+1, r);
  }
}


//  qcksortint

/*  DESCRIPTION:               (used by 'countmetastr()')                   *
 *    quicksort the array of integers 'array'.                              *
 *    'p' and 'r' are the indexes between which                             */

void qcksortint(int* array, int p, int r) {
  int x, y;
  int i, j;
  
  if (p<r) {
    x = array[p];
    i = p;
    j = r;

    while (i<j) {

      while ( i<j && array[i]<x)
	i++;

      while ( i<j && array[j]>=x)
	j--;

      if (i<j) {
	y = array[i];
	array[i] = array[j];
	array[j] = y;
      }
    }

    if (p<i) {
      qcksortint(array, p, i-1);
      qcksortint(array, i, r);
    } else         //array[p] was the smallest element
      qcksortint(array,p+1,r);
  }
}




//  firstsame

/*  DESCRIPTION:               (used by 'longest()')                   *
 *    search the first string before 'b' whose dth character equals    *
 *    the dth character of 'b' and which is after 'a'.                 *
 *    It is used to search the longest repeated substring              */

int firstsame(char* file, int* arSuffix, int a, int b, int d) {
  int first, v, s;
  char c, cc;

  c = file[arSuffix[b]+d];
  first = b;
  
  if (first == a)
    return a;

  //search before b, multiplying the step by 2 each time, until
  //a string whose dth character is different from b's one
  s = 1;
  while ( c == file[ arSuffix[first]+d ] ) {
    s = 2*s;
    first -= s;
    if (first < a)
      break;    
  }
   
  //then, come back to b until a string with the same character is found
  s = s/2 + (s==1);
  v = -1;
  
  do {
    s = s/2 + (s==1);
    
    if (v>0)
      first -= s;
    else
      first += s;
      
    if (first < a)
      first = a;
 
    cc = file[ arSuffix[first]+d ];
  } while  ( (v=(cc-c)) != 0 );
  
  //we have it. Time: O(log n)
  return first;
}



//  longest

/*  DESCRIPTION:                                                        *
 *    returns the length of the longest repeated substring in the index *
 *    First, call 'longest(index, 0, file_length-1, 0)'. It then runs   *
 *    recursively. Time: O(l.n.logn) where n is the length of the text, *
 *    abd l the length of its longest repeated substring.               *
 *                                                                      *
 *    d:    the index of the character on which the strings between a   *
 *          and b need to be compared.                                  *
 *    a,b:  range of indexes between which to search for the longest    *
 *          substring.                                                  */

int longest(char* file, int* arSuffix, int a, int b, int d) {
  int p, l, ll;

  l = 0;

  while (a<b) {
   
    p = firstsame(file, arSuffix, a, b, d);

    if (b-p>0) {
      ll = 1+longest(file, arSuffix, p, b, d+1);
      l = MAX(1,l);
      l = MAX(l,ll);
    }
   
    b = p-1;
  }
  return l;
} 



//  boundedstrcmp

/*  DESCRIPTION:                                                        *
 *    compare the string st1 and st2 until their n-th character         *
 *    the result is the same as cmpstr with st1 and st2 truncated after *
 *    their n-th char.                                                  */

int boundedstrcmp(char* st1, char* st2, int n) {
  int p =0;

  while (p<n) {
    if (st1[p] != st2[p])
      return (int)(st2[p]-st1[p]);
    p++;
  }
  return 0;
}


//  countstr

/*  DESCRIPTION:                                                        *
 *    count the occurences of 'string' inside the string 'file' of      *
 *    length 'length'.                                                  *
 *    arSuffix : sorted suffix array for the string 'file'.             *
 *    i :        the index of one occurence of the string.              *
 *    returns:   an array of 2 pointers: the index to the 1st occurence *
 *               of the string and the index of the last occurence.     */

void countstr(int r[2], char* file, int* arSuffix, 
	      int file_length, char* string, int i) {
  int offset;
  int first, last;
  int s, v;
  int cnt = 0;
  
  int l = strlen(string);
  
  //search the last (on the left) occurence of the string
  last = i;
  if (last != file_length-1) {
    s = 1;
    
    //goes away exponentially fast from the position of the
    //known occurence in the index, until the string does not match
    while ( !boundedstrcmp(string, &file[arSuffix[last]], l) ) {
      s = 2*s;
      last += s;
      if (last >= file_length)
	break;     
    }
       
    s = s/2 + (s==1);
    v = 1;
    
    //now, use a binary search to find the last occurence of the string
    do {
      s = s/2 + (s==1);
      
      if (v>0)
	last -= s;
      else
	last += s;
      
      if (last >= file_length)
	last = file_length-1;
   
    } while  ( (v=(boundedstrcmp(string, &file[arSuffix[last]], l)))!=0 );
  }
  

  //search the first (on the right) occurence of the string
  //same algorithm as above
  first = i;
  if (first > 0) {
    
    s = 1;
    
    while ( !boundedstrcmp(string, &file[arSuffix[first]], l) ) {
      s = 2*s;
      first -= s;
      if (first < 0)
	break;     
    }
    
    s = s/2 + (s==1);
    v = -1;
    
    do {
      s = s/2 + (s==1);
      
      if (v>0)
	first -= s;
      else
	first += s;
      
      if (first < 0)
	first = 0;

    } while  ( (v=(boundedstrcmp(string, &file[arSuffix[first]], l)))!=0 );
  }

  //set the result in the array 'r'
  r[0] = first;
  r[1] = last;
}




//  findstr

/*  DESCRIPTION:                                                        *
 *    use dichotomy to find the 'string', inside the 'index' of length  *
 *    'length' return the position of the 1st occurence of the string,  *
 *    or -1 if the string was not found                                 */

int findstr(char* file, int* index, int length, char* string) {
  int a,b,c,d, pos;
  int a_old, c_old;
  unsigned char r;

  //initialize search boundaries
  a = 0;
  c = length-1;
  b = (a+c)/2;
  d = 0;
  
  do {

    if (string[d] == file[index[a]+d] == file[index[c]+d])
      d++;
    
    //optimization:
    //compare the strings 'string' and 'index[i]' from their d-th character
    pos = d;
    for(r = string[pos];
	r && ( r == file[index[b]+pos] );
	pos++, r=string[pos]
	);

    //update boundaries
    if (r) {
      if ( file[index[b]+pos] - r < 0 )
	a = b;
      else
	c = b;

      if (b==c && c-a==1)             //to avoid infinite loops
	return -1;
    }

    b = (a+c)/2+(c-a==1);   //when c-a = 1, add 1 to force b to increase
    
  } while(r && a!=c);
 
  if (!r) 
    return b;
  else
    return -1;
}



// lcsort

/*  DESCRIPTION                                                                   *
 *    optimized csort: no recursivity but a loop, no new array at each level,     *
 *    but 2 big ones from the start                                               *
 *    arSuffix : non sorted suffix array initialised with indexes to all suffixes *
 *               of file.                                                         *
 *    index2 :   a non initialized chunk of memory of same size as arSuffix       *
 *    length :   number of pointers to sort                                       */
//REM: it is assumed that 'int's are 32 bits long...

void lcsort(char* file, int* arSuffix, int* index2, 
	    int length, long int* array) {

  //for each iteration of the while loop
  int n_jobs;           // number of sorting needed
  int n_jobs2;          // idem but for the next level of iteration

  int digit;            // digit on which to sort
  int* jobs;            /* array describing a group of entries of index 
			   (first entry, last entry) that need to be sorted 
			   on there 'digit'-th digit */
  int* jobs2;           // jobs for next level of iteration

  //various variables
  int i, j, a, b, d1, d2;
  unsigned char pos;
  int* z;

  //jobs may contain up to length/2 entries, of 2 int each
  jobs  = (int*)malloc(length*sizeof(int));
  jobs2 = (int*)malloc(length*sizeof(int));
  if (jobs==NULL || jobs2==NULL) {
    printf("Error: not enough memory to sort index\n");
    exit(0);
  }

  //for the first iteration, we want to sort the whole index
  n_jobs2 = 0;
  n_jobs  = 1;
  digit   = 0;
  jobs[0] = 0;
  jobs[1] = length-1;
  
  do {
  
    for(j=0; j<n_jobs; j++) {
      a = jobs[2*j];
      b = jobs[2*j+1];
      
      for(i=0; i<256; i++)           //fill array with zeros
	array[i] = 0;

      for(i=a; i<=b; i++) {          //count occurences of chars    
	pos = file[arSuffix[i]+digit];
	array[pos]++;
      }
      
      for(i=1; i<256; i++)           //sum occurences of chars
	array[i] += array[i-1];
      
      //add new jobs to the list of jobs for next level of loop, ie jobs2
      d1=0;
      for(i=0; i<256; i++) {    
	d2 = array[i];
	if (d2-d1>1) {
	  jobs2[2*n_jobs2] = a+d1;
	  jobs2[2*n_jobs2+1] = a+d2-1;
	  n_jobs2++;
	}
	d1 = d2;
      }

      //sort index2
      //index2 contains the pointers of index, 
      //ordered following their p-th digit
      for(i=b; i>=a; i--) {
	pos = file[arSuffix[i]+digit];     /* position in array of the char at 
					      pos digit in string i of index;*/
	index2[a+array[pos]-1]=arSuffix[i];
	array[pos]--;
      }
      //now, update index based on index2
      for(i=a; i<=b; i++)
	arSuffix[i] = index2[i];
    }
    
    //set jobs parameters to jobs2 ones
    n_jobs = n_jobs2;
    n_jobs2 = 0;
    digit++;

    z = jobs;
    jobs = jobs2;
    jobs2 = z;
    
  } while (n_jobs>0);
  
  free(jobs);
  free(jobs2);
}


// lcsort

/*  DESCRIPTION                                                                   *
 *    a recursive counting sort used to sort the suffix array arSuffix.           *
 *    arSuffix : non sorted suffix array initialised with indexes to all suffixes *
 *               of file.                                                         *
 *    index2 :   a non initialized chunk of memory of same size as arSuffix       *
 *    length :   number of pointers to sort                                       */

void csort(char* file, int* arSuffix, int* index2, 
	   int p, int a, int b, long int* array) {
  int i;
  long int array2[256];          //array containing positions of chars in file
  long int d1, d2;               //positions in file
  char** z;
  unsigned char pos;             /*a char in file, casted as int when used 
				   as index for array*/

  for(i=0; i<256; i++)           //fill array with zeros
    array[i] = 0;

  for(i=a; i<=b; i++) {           //count occurences of chars    
    pos = file[arSuffix[i]+p];
    array[ pos ]++;
  }
	 
  for(i=1; i<256; i++)           //sum occurences of chars
    array[i] += array[i-1];

  for(i=0; i<256; i++)           //copy array to array2
    array2[i] = array[i];

  //index2 contains the pointers of index, ordered following their p-th digit
  for(i=b; i>=a; i--) {
    pos = file[arSuffix[i]+p];   /*position in array of the 
				   char at pos p in string i of index;*/
    index2[a+array[pos]-1]=arSuffix[i];
    array[pos]--;
  }

  //now, update index based on index2
  for(i=a; i<=b; i++)
    arSuffix[i] = index2[i];

  //all strings having the same p first digit are now sorted 
  //on their p+1-th digit
  d1=0;
  for(i=0; i<256; i++) {    
    d2 = array2[i];
    if (d2-d1>1)
      csort(file, arSuffix, index2, p+1, a+d1, a+d2-1, array);
    d1 = d2;
  }
}


//qcksortstr

/*  DESCRIPTION                                                     *
 *    an implementation of quicksort to sort a suffix array between *
 *    positions p and r                                             */

void qcksortstr(char* file, int* arSuffix, int p, int r) {
  char* x;
  int y;
  int i, j;
  
  if (p<r) {
    x = &file[arSuffix[p]];
    i = p;
    j = r;

    while (i<j) {

      while ( i<j && strcmp(&file[arSuffix[i]],x) < 0)
	i++;

      while ( i<j && strcmp(&file[arSuffix[j]],x) >= 0)
	j--;

      if (i<j) {
	y = arSuffix[i];
	arSuffix[i] = arSuffix[j];
	arSuffix[j] = y;
      }
    }

    if (p<i) {
      qcksortstr(file, arSuffix, p, i-1);
      qcksortstr(file, arSuffix, i, r);
    } else         //index[p] was the smallest element
      qcksortstr(file, arSuffix,p+1,r);
  }
}

/* Implementation of the Manber-Myers suffix sorting algorithm.               *
 * parameters:                                                                *
 *    file          char array containing the string to sort suffixes of      *
 *    arSuffix      int array that will contain the resulting array           *
 *    file_length   integer containing the length of the string               *
 *    prm           temporary array of int. Needs to be of length file_length *
 *    count         temporary array of int. Needs to be of length file_length *
 *    bucketStart   temporary array of char. Used as boolean array. Needs to  *
 *                  be of length file_length+1                                *
 *    bucket2Start  same as bucketStart                                       *
 *    bucket        array of long int with length 256. Used during initial    *
 *                  bucketing                                                 */
void manberMyers( char* file, int* arSuffix, int file_length, 
		  int* prm, int* count, char* bucketStart, 
		  char* bucket2Start, long int* bucket ) {
  int b, c, d, d2, e, f, i, h, lastBucketStart;
  char ch;

  //Phase one, bucketing on first character.
  //----------------------------------------

  //Initialise the buckets.
  for (c=0; c<256; c++) {
    bucket[c]=-1;
  }
  printf("  - Buckets initialized\n");

  //Figure out in which bucket every item goes
  for (c=0; c<file_length; c++) {
    ch = file[c];
    prm[c] = bucket[ch];
    bucket[ch] = c;
    bucketStart[c] = 0;
    bucket2Start[c]=0;
  }
  printf("  - Found which bucket every suffix goes\n");

  //Place each item in its bucket.
  c = 0;
  for (b=0; b<256; b++) {
    i = bucket[b];
    bucketStart[c] = 1;
    while ( i != -1 ) {
      arSuffix[c] = i;
      i = prm[i];
      c++;
    }
  }
  printf("  - Done initial bucketing\n");

  //Fix up the prm array.
  //Not necessary
  //for (i=0; i<file_length; i++) {
  //	prm[arSuffix[i]]=i;
  //	count[i]=0;
  //}


  bucketStart[file_length] = 1;

  // Phase 2: Bucket on h=2^n first characters.
  // ------------------------------------------

  for (h = 1; h < file_length; h*=2) {

    for (c=0; c<file_length;c++) {
      if (bucketStart[c]==1) {
	count[c]=0;
	lastBucketStart = c;
      }
      prm[arSuffix[c]]=lastBucketStart;
    }

    d = file_length-h;
    e = prm[d];
    prm[d] = e + count[e];
    count[e]++;
    bucket2Start[ prm[d] ] = 1;
	
    lastBucketStart=0;
    for (c=0; c<file_length; c++) {
      do {
	if (arSuffix[c]-h >= 0) {
	  count[prm[arSuffix[c]-h]]++;
	  prm[arSuffix[c]-h]+=count[prm[arSuffix[c]-h]]-1;
	  bucket2Start[prm[arSuffix[c]-h]]=1;
	}
	c++;
      }
      while (!bucketStart[c]);

      c=lastBucketStart;
      do {
	if (arSuffix[c]-h >= 0) {
	  d2=arSuffix[c]-h;
	  if (bucket2Start[prm[d2]]) {
	    f=prm[d2]+1;
	    while (bucket2Start[f] && !bucketStart[f]) {
	      bucket2Start[f]=0;
	      f++;
	    };
	  }
	}
	c++;
      }
      while (!bucketStart[c]);
      lastBucketStart=c;
      c--;
    }
    for(i=0; i<file_length; i++) {
      arSuffix[prm[i]]=i;
    }
    for (i=0; i<file_length; i++) {
      if (bucket2Start[i]) {
	bucketStart[i]=1;
      }
    }
  }
  printf("  - Done sorting\n");
}


int main(int argc, char** arg) {
  
  int i;
  long int file_length;   //length in bytes of the file

  int* arSuffix;          //the suffix array

  int* arTemp1;           // temporary arrays used when sorting the suffix array
  int* arTemp2;
  char* arTemp3;          // these two temporary arrays will only contain
  char* arTemp4;          // boolean values
  int* arCheck;           /* second suffix array, used when checking 
			     correctness of algorithms */
  char* file;             // string array to hold the content of the file
  int c;                  
  FILE* in;
  long int bucket[256];   /* array used by recursive counting sort 
			     and Manber-Myers */
  int r[2];
  time_t nTime;           //keep track of time in seconds

  //command line options
  int method  = 3;        //tells what method of suffix sorting to use.
  int verbose = 0;
  int count   = 0;
  int mcount  = 0;
  int lrecsub = 0;
  int check   = 0;
  char* cstring;
  char* mcstring;
  char* filename;
  char* starg;

  printf("\n---------------------------------------------------------\n");
  printf("Program for sorting a suffix array and searching in text.\n");
  printf("Written by Erwan Lemonnier and Eric Nordenstam for the\n");
  printf("course Advanced Algorithms in December 2000.\n");
  printf("---------------------------------------------------------\n\n");
  
  //test parameters
  if (argc==1) {
    printf("Build an index of a text file <filename> and provide some functions for\n");
    printf("searching substrings in this text\n");
    printf("Written by Erwan Lemonnier on 12/2000 for 'Advanced Algorithm', Homework 2\n\n");
    printf("usage: index [-mX] [-v] [-c \"string\"] [-mc \"string\"] [-l] <filename>\n\n");
    printf("       <filename>     : filename of the test file to analyse.\n");
    printf("                        index <filename> just build a sorted index of the text.\n");
    printf("       -m0            : sort index with quicksort.\n");
    printf("       -m1            : sort index with recursive counting sort.\n");
    printf("       -m2            : sort index with optimised looping counting sort.\n");
    printf("       -m3            : sort index with the Manber-Myers algorithm.\n\n");
    printf("       --check        : sort with looping counting sort and compare result\n");
    printf("       -v             : show the sorted index.\n");
    printf("       -c  \"string\"   : count the occurences of \"string\" in the text.\n");
    printf("       -mc \"string\"   : same as -c, but \"string\" may contain the meta character '*'.\n");
    printf("       -l             : count occurences of the longest repeated substring in text.\n\n");
    exit(0);
  }

  //parsing of parameters
  for(i=1; i<argc-1; i++) {
    starg = arg[i];

    if (!strcmp(starg,"-m0"))
      method = 0;

    if (!strcmp(starg,"-m1"))
      method = 1;   
   
    if (!strcmp(starg,"-m2"))
      method = 2;

    if (!strcmp(starg,"-m3"))
      method = 3;

    if (!strcmp(starg,"-v"))
      verbose = 1;
    
    if (!strcmp(starg,"--check")) {
      check = 1;
    }

    if (!strcmp(starg,"-c")) {
      count = 1;
      cstring = arg[i+1];
      i++;
    }

    if (!strcmp(starg,"-mc")) {
      mcount = 1;
      mcstring = arg[i+1];
      i++;
    }

    if (!strcmp(starg,"-l"))
      lrecsub = 1;
  }
    
  filename = arg[argc-1];

  //try opening file
  if (!(in = fopen(filename, "r"))) { 
    printf("can't find file %s\n", filename);
    exit(0);
  }

  //compute file length
  fseek(in, 0, SEEK_END);
  file_length = ftell(in);
  rewind(in);
 
  //load file in memory, as a long string
  printf("\n* Loading file into memory\n  - allocating %d bytes\n", file_length);
  file = (char*)malloc(file_length+1);
  if (file==NULL) {
    printf("  - Error: not enough memory to load file\n");
    exit(0);
  }
  fread(file,1 ,file_length, in);
  fclose(in);
  file[file_length] = '\0';      //add a second null char, for dlcsort

  //build index
  printf("\n* Allocating suffix array\n  - size %d bytes\n", file_length*sizeof(int));
  arSuffix = (int*)malloc(file_length*sizeof(int));

  if (arSuffix == NULL) {
    printf("  - Error: Could not allocate memory for suffix array");
    exit(0);
  }

  for(c = 0; c < file_length; c++)
    arSuffix[c] = c;

  //sort index

  nTime = -time(NULL);

  if (method==0) {
    printf("\n* Sorting index\n  - starting quicksort\n");
    qcksortstr(file, arSuffix, 0, file_length-1);
  }

  if (method==1) {
    printf("\n* Sorting index\n  - using recursive counting sort\n");
    printf("  - allocating %i bytes of temporary space\n", file_length*sizeof(int));
    arTemp1 = (int*)malloc(file_length*sizeof(int));
    if (arTemp1==NULL) {
      printf("  - Error: not enough memory to secondary index\n");
      exit(0);
    }
    printf("  - starting sort\n");
    csort(file, arSuffix, arTemp1, 0, 0, file_length-1, bucket);
    printf("  - freeing temporary space\n");
    free(arTemp1);
  }

  if (method==2) {
    printf("\n* Sorting index\n  - using looping counting sort\n");
    printf("  - allocating %i bytes of temporary space\n", file_length*sizeof(int));
    arTemp1 = (int*)malloc(file_length*sizeof(int));
    if (arTemp1==NULL) {
      printf("  - Error: not enough memory to secondary index\n");
      exit(0);
    }
    printf("  - starting looping counting sort\n");
    lcsort(file, arSuffix, arTemp1, file_length, bucket);
    printf("  - freeing temporary space\n");
    free(arTemp1);
  }

  if (method==3) {
    printf("\n* Sorting index\n  - using Manber-Myers algorithm\n");
    printf("  - allocating %i bytes of temporary space\n", 
	   file_length*(sizeof(int)*2+sizeof(char)*2));
    arTemp1 = (int*)malloc(file_length*sizeof(int));
    arTemp2 = (int*)malloc(file_length*sizeof(int));
    arTemp3 = (char*)malloc((file_length+1)*sizeof(char));
    arTemp4 = (char*)malloc((file_length+1)*sizeof(char));
    if (arTemp1==NULL || arTemp2==NULL || arTemp3==NULL || arTemp4==NULL) {
      printf(" - Error: not enough memory to secondary index\n");
      exit(0);
    }
    printf("  - starting sort\n");
    manberMyers(file, arSuffix, file_length, 
		arTemp1, arTemp2, arTemp3, arTemp4, bucket);
    printf("  - freeing temporary space\n");
    free(arTemp1);
    free(arTemp2);
    free(arTemp3);
    free(arTemp4);
  }

  nTime += time(NULL);

  printf("  - sorting the array took %i seconds.\n", nTime);

  if (check) {
    printf("\n* Comparative sort\n  - allocating %i bytes\n", 
	   file_length*(sizeof(int)*3+sizeof(char)*2));
    arTemp1 = (int*)malloc(file_length*sizeof(int));
    arTemp2 = (int*)malloc(file_length*sizeof(int));
    arTemp3 = (char*)malloc((file_length+1)*sizeof(char));
    arTemp4 = (char*)malloc((file_length+1)*sizeof(char));
    arCheck = (int*)malloc(file_length*sizeof(int));
    if (arTemp1==NULL || arTemp2==NULL || arTemp3==NULL 
	|| arTemp4==NULL || arCheck==NULL) {
      printf("  - Error: not enough memory for suffix array check.\n");
      exit(0);
    }
    printf("  - starting Manber-Myers\n");

    manberMyers(file, arCheck, file_length, 
		arTemp1, arTemp2, arTemp3, arTemp4, bucket);
    printf("  - done sorting.\n");

    c=0;
    check = 0;
    for (c=0; !check && ( c<file_length); c++ ) {
      if (arCheck[c] != arSuffix[c]) 
	check = 1;
    }
	
    if (check) {
      printf("  - Error!! one of the sorting algorithms is failing\n");
    }
    else {
      printf("  - sorting algorithms compare ok\n");
    }

    printf("  - freeing temporary space\n");
    free(arTemp1);
    free(arTemp2);
    free(arTemp3);
    free(arTemp4);
    free(arCheck);
  }

  nTime = -time(NULL);

  if (verbose) {
    printf("The sorted index:\n");
    for(i=0; i<file_length; i++)
      printf("%d\t%s\n",i,&file[arSuffix[i]]);
    printf("\n");
  }


  if (count) {
    nTime = -time(NULL);

    printf("\n* Searching\n  - String: \"%s\"\n", cstring);
    i = findstr(file, arSuffix, file_length, cstring);
    if (i==-1)
      printf("  - no occurence found\n");
    else {
      printf("  - counting occurences\n");
      countstr(r, file, arSuffix, file_length, cstring, i);
      printf("  - found %d occurences\n", r[1]-r[0]+1);
    }

    nTime += time(NULL);
    printf("  - search took %i seconds\n", nTime);
  }

  if (lrecsub) {
    nTime = -time(NULL);
    printf("\n* Searching for the longest repeated substring\n");
    i = longest(file, arSuffix, 0, file_length-1, 0);
    printf("  - its length = %d\n", i);   
    nTime += time(NULL);
    printf("  - search took %i seconds\n", nTime);
     
  }

  if (mcount) {
    nTime = -time(NULL);
    printf("\n* Searching with metacharacters\n  - string: \"%s\"\n",mcstring);
    i = countmetastr(arSuffix, file_length, mcstring, file);
    printf("  - found %d occurences\n",i);
    nTime += time(NULL);
    printf("  - search took %i seconds\n", nTime);
  }

  free(arSuffix);
}
