/*=======================================================================*\
 * taxicab.cc : searching multiple values of  f(a,b) = a^3+b^3
 * Copyright (C) 2003-2005 Michael Eisermann, all rights reserved
 * Institut Fourier, Université Grenoble I, France
 *
 * Please send bug reports, suggestions, and comments to
 *   Michael Eisermann <Michael.Eisermann@ujf-grenoble.fr>
 * For more information and updates see the web page
 *   http://www-fourier.ujf-grenoble.fr/~eiserm
 * If you have used my software in your research, please let me know.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or 
 * (at your option) any later version. In particular, permission to use, 
 * copy, modify, or redistribute this software for research and education 
 * is granted without fee, provided that the above copyright notice and 
 * this permission appear in all copies and supporting documentation. 
 *
 * This software is distributed in the hope that it will be useful,
 * but it is provided "as is" without any warranty, without even 
 * the implied warranty of fitness for a particular purpose. 
 * See the GNU General Public License for more details.
 *
 *-----------------------------------------------------------------------
 * Description:
 *-------------
 * This program searches for multiple values of  f(a,b) = a^3+b^3  
 * using efficient algorithms for sorted enumeration.
 *-----------------------------------------------------------------------
 * Compilation:
 *-------------
 * The implementation has been successfully tested and optimized 
 * using the GNU compiler. A typical set of options could be:
 *
 * g++ taxicab.cc -o taxicab -D_verbose -O2 -static
 *
 * Remarks:
 * -D_verbose  adds code to write initialization messages to cout.
 * -D_asm      uses inline asm code rather than substitute C code.
 * -D_heap     uses own heap implementation rather than STL algorithms.
 * -D_run=<n>  run only n periods (test) instead of long-term search.
 *
 * -O2 and -O3  optimize for speed (see the g++ documentation.)
 *  This switches on almost all available optimization options.
 * -static  increases speed but produces a very large executable.
 *  Compilation without the -static option uses shared libraries.
 *-----------------------------------------------------------------------
 * Remarks:
 *---------
 * The current implementation uses hand crafted i486 code for
 * 96bit integers (see comments in the file "sumcubes.cc"),
 * which is fast on i486 processors but not portable to others.
 * Substitute C-code is provided to ensure a portable version.
 * Their share in the overall performance seems negligible.
 *
 * Further optimization options, which are not always successful:
 * -fomit-frame-pointer  saves some register pushs and pops.
 *  It also makes debugging impossible, but that seems ok.
 * -fexpensive-optimizations  performs a number of minor optimizations 
 *  that are relatively expensive and might not be worth the effort.
 * -malign-double  aligns `long long' variables  on a two word boundary.
 *  This produces code that runs somewhat faster but is somewhat longer.
 *-----------------------------------------------------------------------
 * Profile:
 *---------
 * More than 90% of the time is spent in advancing the enumeration,
 * including 80% in heap operations, roughly 70% pop and 10% push.
 * Less than 5% is spent in the calculation of f(a,b) = a^3 + b^3. 
 * This means that optimizations should concentrate on the priority 
 * queue (faster implementation or better algorithms/data structures.)
 *
 * The following samples illustrate the point. (The fifth taxicab number 
 * is 48988659276962496, which provides a quick test of correctness.
 * One could also start below 24153319581254312065344, which is a 
 * six-way number, and possibly even the smallest one.)
 * 
 * Typical situations to test:
 *   execute: taxicab level 48953000000000000 
 *        or: taxicab level 24153319581000000000000
 *
 * Profiling with gprof:
 *   compile: g++ taxicab.cc -o taxicab -O3 -static -pg
 *   analyze: gprof -q taxicab > taxicab.prof
 *
 * Profiling with gcov:
 *   compile: g++ taxicab.cc -o taxicab -fprofile-arcs -ftest-coverage
 *   analyze: gcov taxicab.cc
 *
 *=======================================================================
 * Implementation and change history:
 *-----------------------------------------------------------------------
 * 2005/07/27, version 1.0, author: Michael Eisermann
 * - released as version 1.0
 *-----------------------------------------------------------------------
 * 2005/06/12, version 0.93, author: Michael Eisermann
 * - Optimized heap operations in "bimonotone.hh"
 *-----------------------------------------------------------------------
 * 2005/06/10, version 0.92, author: Michael Eisermann
 * - Revised assembly code in "sumcubes.cc"
 *-----------------------------------------------------------------------
 * 2005/06/01, version 0.91, author: Michael Eisermann
 * - Minor changes in order to comply with "bimonotone.hh" version 0.7.
 *-----------------------------------------------------------------------
 * 2004/08/25, version 0.9, author: Michael Eisermann
 * - Use BimStreamDiff in order to benefit from differential model
 *   (only differentials are stored in the priority queue). This uses
 *   12% less memory and speeds up priority queue handling by some 20%.
 * - Memory requirement for each item in the priority queue: 28 bytes.
 *   In detail: 8 bytes for the differential, 4 bytes for a, 
 *   4 bytes for b, plus 12 bytes for the three pointers.
 *-----------------------------------------------------------------------
 * 2004/07/28, version 0.8, author: Michael Eisermann
 * - Reorganized previous "taxicab.cc" file into two separate files:
 *   the entire calculation routine is now found in "sumcubes.cc".
 *   Sorted enumeration streams are templated in "bimonotone.hh".
 * - Analyzed profile using gprof, optimized hot spots in assembly code,
 *   see the include file "sumcubes.cc".
 * - Memory requirement for each item in the priority queue: 32 bytes.
 *   In detail: 12 bytes for the value f(a,b), 4 bytes for a, 
 *   4 bytes for b, plus 12 bytes for the three pointers.
\*=======================================================================*/

// #define _verbose // write initialization messages to cout

// Include standard libraries
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <string>
using namespace std;

// Include templates for sorted enumeration
#include "bimonotone.hh"

// Include ad hoc implementation of 96bit integers
#include "sumcubes.cc"

typedef Function::A A;
typedef Function::B B;
typedef Function::Z Z;
typedef Function::D D;

/*-----------------------------------------------------------------------*\
 * Version information for this program
\*-----------------------------------------------------------------------*/

const string kVersion( "version 1.0" );
const string kLogFile( Function().filename()+".log" );
const string kLogBack( Function().filename()+".log~" );
const string kOutFile( Function().filename()+".out" );
const string kOutBack( Function().filename()+".out~" );

ostream& version( ostream& out )
{
  out << "taxicab -- searching multiple values of " << Function().description() << endl 
      << "Copyright (C) 2003-2005 by Michael Eisermann, all rights reserved" << endl
      << "This is " << kVersion << " compiled " << __DATE__ 
      << " " << __TIME__ 
    
#ifdef _verbose
      << ", options: " << endl

#ifdef _verbose
      << "- verbose "
#else
      << "- non verbose "
#endif
    
#ifdef _asm
      << "- inline asm "
#else
      << "- avoid asm "
#endif
    
#ifdef _heap
      << "- use own heap "
#else
      << "- use STL heap "
#endif

#ifdef _run
      << "- run " << _run << " periods "
#else
      << "- long-term search "
#endif

#endif
      << endl;
  return out;
};

ostream& usage( ostream& out )
{
  out << endl << version << endl << "Usage:" << endl
      << "taxicab level <begin>   -- start searching at a given level\n"
      << "taxicab length <begin>  -- start searching at a given length\n"
      << "taxicab continue        -- continue searching from log file\n"
      << "All input/output uses the log file \"" << kLogFile << "\"\n"
      << "Results are listed in the separate file \"" << kOutFile << "\"\n"
      << endl;
  return out;
};

#include <ctime>
ostream& date( ostream& out )
{
  time_t now= time(NULL);
  tm *date= localtime(&now);
  out << setfill('0')
      << setw(4) << date->tm_year+1900 << "/" 
      << setw(2) << date->tm_mon+1 << "/"
      << setw(2) << date->tm_mday << " "
      << setw(2) << date->tm_hour << ":"
      << setw(2) << date->tm_min << ":"
      << setw(2) << date->tm_sec
      << setfill(' ');
  return out;
};

inline unsigned long long int 
get_cpu_ticks()
{
  // This function reads the CPU clock (in ticks = CPU cycles).
  // RDTSC is the assembly code for `Read Time-Stamp Counter'.
  // For information, one can read local processor information 
  // using the command line $ cat /proc/cpuinfo | grep "cpu MHz"   
  unsigned long long int t;
  asm volatile ( "RDTSC" : "=A" (t) );
  return t;
};

/*-----------------------------------------------------------------------*\
 * Search for multiple values in a sorted enumeration stream
\*-----------------------------------------------------------------------*/

template <typename A, typename B, typename Z>
void search_repetitions( BinStream<A,B,Z>& theStream )
{
  // Specify the time interval for writing status messages to the log file 
  const int kMinInterval= 400;           // minimum time interval, eg 400 seconds
  const int kMaxInterval= 900;           // maximum time interval, eg 900 seconds
  const long long kMinIts= 1000000;      // minimum number of iterations
  long long iterations= kMinIts;         // current number of iterations
  time_t lasttime;                       // time of last log message
  double interval= 0;                    // interval between messages
  unsigned long long int ticks0, ticks1; // CPU ticks during interval

  // Open output file (remains open during search)
  ofstream out( kOutFile.c_str(), ios::out | ios::app );
  if ( !out ) 
    {
      cerr << "Cannot open \"" << kOutFile << " for writing!" << endl;
      exit(1);
    };

  // Advance in the stream, looking for multiplicities
  Z previous= theStream.value();        // previous value of the stream
  int multiplicity= 1;                  // current multiplicity
#ifdef _run
  for ( int k= 0; k < _run; ++k )       // finite loop for testing and profiling
#else
  for(;;)                               // infinite loop for long-term search
#endif
    {
      lasttime= time(NULL);
      ticks0= get_cpu_ticks();
      for ( long its=0; its<iterations; ++its )
	{
	  theStream.advance();
	  if ( previous == theStream.value() )
	    {
	      ++multiplicity;
	    }
	  else 
	    {
	      if ( multiplicity > 1 )
		{
		  if ( multiplicity > 2 )
		    out << "* " << multiplicity << " : " << previous << "\n";
		  multiplicity= 1;
		};
	      previous= theStream.value();
	    };
	};
      ticks1= get_cpu_ticks();
      interval= difftime(time(NULL),lasttime);
      
      // Important: flush output stream buffer now
      out.flush();

      // Open log file, write status message, then close log file
      ofstream log( kLogFile.c_str(), ios::out | ios::app );
      if ( log  ) 
	{
	  log << date 
	      << " level=" << theStream.value() 
	      << " length=" << theStream.length()
	      << " width=" << theStream.width();
	  if ( interval >= 10 )
	    log << " frequency=" << long(iterations/interval) << "/s";
	  log << " ticks=" << (ticks1-ticks0) / iterations << endl;
	  log.close();
	};
      
      // Make backups of out file and log file
      system( string("cp -uf "+kOutFile+" "+kOutBack+" &").c_str() );
      system( string("cp -uf "+kLogFile+" "+kLogBack+" &").c_str() );
      
      // Check time interval and adapt number of iterations if necessary
      if ( interval > kMaxInterval && iterations >= 2*kMinIts ) iterations/= 2;
      else if ( interval < kMinInterval ) iterations*= 2;
    };

  out.close();
};

/*-----------------------------------------------------------------------*\
 * Search for a word in an input stream 
 * using the algorithm of Knuth-Morris-Pratt
\*-----------------------------------------------------------------------*/

void init( const string& word, vector<int>& next )
{
  // Analyze the word for repetitions. The resulting vector
  // is used in the following search routine.
  next.resize( 0 );
  next.resize( word.size()+1, -1 );
  for ( int i=0, j=-1; i<int(word.size()); ++i, ++j, next[i]= j )
    while( j>=0 && word[j]!=word[i] ) j= next[j];
};

bool find( istream& in, const string& word, const vector<int>& next )
{
  // Search the stream for the occurence of the specified word. 
  // In order to do so, the vector  next  must be initialized as above.
  for ( int i=0; i<int(word.size()); ++i )
    {
      if ( !in ) return false;
      char letter; in >> letter;
      while ( i>=0 && word[i]!=letter ) i= next[i];
    };
  return true;
};

bool find( istream& in, const string& word )
{
  vector<int> next;
  init( word, next );
  return find( in, word, next );
};

/*=======================================================================*\
 * Read command line parameters and start search
\*=======================================================================*/

int main( int argc, char* argv[] )
{ 
  /*---------------------------------------------------------------------*\
   * If necessary display program name and help message
  \*---------------------------------------------------------------------*/
  if ( argc < 2 || argc > 3 )
    {
      cerr << usage;
      return 0;
    };

  if ( string(argv[1]) == string("-h") 
       || string(argv[1]) == string("--help") )
    {
      cout << usage;
      return 0;
    };
    
  if ( string(argv[1]) == string("-v") 
       || string(argv[1]) == string("--version") )
    {
      cout << version;
      return 0;
    };

  /*---------------------------------------------------------------------*\
   * Test write access and create files if necessary
  \*---------------------------------------------------------------------*/
  {
    ofstream log( kLogFile.c_str(), ios::out | ios::app ); 
    if ( !log ) 
      {
	cerr << "Cannot open \"" << kLogFile << " for writing!" << endl;
	exit(1);
      };
    log.close();
  }
  
  {
    ofstream log( kLogBack.c_str(), ios::out | ios::app ); 
    if ( !log ) 
      {
	cerr << "Cannot open \"" << kLogBack << " for writing!" << endl;
	exit(1);
      };
    log.close();
  }
  
  {
    ofstream out( kOutFile.c_str(), ios::out | ios::app ); 
    if ( !out ) 
      {
	cerr << "Cannot open \"" << kOutFile << " for writing!" << endl;
	exit(1);
      };
    out.close();
  }

  {
    ofstream out( kOutBack.c_str(), ios::out | ios::app ); 
    if ( !out ) 
      {
	cerr << "Cannot open \"" << kOutBack << " for writing!" << endl;
	exit(1);
      };
    out.close();
  }

  /*---------------------------------------------------------------------*\
   * Define the search stream
   *---------------------------------------------------------------------
   * Optimization:
   * In order to save time and space we employ here a differential model:
   * only the differentials are stored in the priority queue. This allows 
   * us to choose a type of smaller size, unsigned long long int (64bits),
   * capable of storing integers from 0 to a little more than 1.8446e19.
   *---------------------------------------------------------------------
   * What values are to be expected?
   * We are streaming values of f(a,b) = a^3 + b^3 that are smaller than
   * T =  24 153 319 581 254 312 065 344, where T is the smallest known 
   * integer that can be written as sum of two positive cubes in at least
   * six different ways. The differentials f(a+1,b)-f(a,b) = 3a(a+1)+1 
   * that appear in this range are bounded by 2.6e15, since the worst
   * case appears for a=2.9e7, b=1.
   *---------------------------------------------------------------------
   * How to avoid overflow?
   * When updating the enumeration stream, one has to check the first
   * differential (at the front of the priority queue). As soon as it
   * gets larger than the prescribed tolerance, all differentials in 
   * the queue are reduced and the offset is updated. To be efficient, 
   * one should choose the tolerance as large as possible, but still 
   * small enough to avoid overflow.
   *---------------------------------------------------------------------
   * Conclusion:
   * With a tolerance of 18443e15, even the maximal differential 2.6e15 
   * cannot possibly cause an overflow.
  \*---------------------------------------------------------------------*/
  BimStreamDiff<Function> theStream( 18443000000000000000ULL );
  // BimStream<Function> theStream;

  /*---------------------------------------------------------------------*\
   * Initialize the search stream (read level from the command line)
  \*---------------------------------------------------------------------*/
  if ( string(argv[1]) == string("level") )
    {
      // Read starting level from command line
      if ( argc < 3 )
	{
	  cerr << "option \"level\" needs a parameter\n" << usage;
	  exit(1);
	};
      Z begin= Z(string(argv[2]));
#ifdef _verbose
      cout << version << date 
	   << " initializing stream to level " << begin << endl;
#endif
      theStream.reset( begin );
#ifdef _verbose
      cout << date 
	   << " level=" << theStream.value() 
	   << " length=" << theStream.length()
	   << " width=" << theStream.width() 
	   << endl << endl;
#endif
    }

  /*---------------------------------------------------------------------*\
   * Continue previous search (read last level from the log file)
  \*---------------------------------------------------------------------*/
  else if ( string(argv[1]) == string("continue") )
    {
      // If need be, try to recover damaged log files and output files
      // (assuming that the larger file is the better one)
      system( string("cp -f $( ls -S "+kLogFile+" "+kLogBack+" )").c_str() );
      system( string("cp -f $( ls -S "+kOutFile+" "+kOutBack+" )").c_str() );
      
      // Try to open log file for reading
      ifstream in( kLogFile.c_str(), ios::in );
      if ( !in ) 
	{
	  cerr << "Cannot open \"" << kLogFile << "\" for reading!" << endl;
	  exit(1);
	};
      
      // Read last level in order to continue without loss
      string word("level=");
      vector<int> next;
      init( word, next );
      Z begin(0ul);
      while ( find( in, word, next ) ) in >> begin;
      in.close();
#ifdef _verbose
      cout << version << date
	   << " continuing at level " << begin << endl;
#endif
      theStream.reset( begin );
#ifdef _verbose
      cout << date 
	   << " level=" << theStream.value() 
	   << " length=" << theStream.length()
	   << " width=" << theStream.width() 
	   << endl << endl;
#endif
    }

  /*---------------------------------------------------------------------*\
   * Initialize the search stream to a certain length
   * (This is an ad hoc solution and by no means efficient.)
  \*---------------------------------------------------------------------*/
  else if ( string(argv[1]) == string("length") )
    {
      // Read starting length from command line
      if ( argc < 3 )
	{
	  cerr << "option \"length\" needs a parameter\n" << usage;
	  exit(1);
	};

      BimStreamDiff<Function>::length_type length, lmin, lmax, lmid, skip(500000);
      // BimStream<Function>::length_type length, lmin, lmax, lmid, skip(500000);
      istringstream in(argv[2]);
      in >> length;
      cout << version << date 
	   << " initializing stream to length " << length << endl;
      
      // Calculate the starting level (via a hybrid binary/linear search)
      Z min(1ul), max(2ul), one(1ul);
      theStream.reset(max);
      lmin= 0; lmax= theStream.length();
      while ( length > lmin+skip && length >= lmax )
	{
	  min= max;
	  lmin= lmax;
	  max*= 2;
	  theStream.reset(max);
	  lmax= theStream.length();
	  /*
	    #ifdef _verbose
	    cout << "level " << min << "   \r" << flush;
	    #endif
	  */
	}
      while ( length > lmin+skip && max > min+one )
	{
	  Z mid(min+max);
	  mid/= 2;
	  theStream.reset(mid);
	  lmid= theStream.length();
	  if ( length < lmid ) 
	    {
	      max= mid;
	      lmax= lmid;
	    }
	  else
	    {
	      min= mid;
	      lmin= lmid;
	    };
	  /*
	    #ifdef _verbose
	    cout << "level " << min << " ... " << max << "   \r" << flush;
	    #endif
	  */
	}
      theStream.reset(min);
      while ( theStream.length() < length ) theStream.advance();
#ifdef _verbose
      cout << date 
	   << " level=" << theStream.value() 
	   << " length=" << theStream.length()
	   << " width=" << theStream.width() 
	   << endl << endl;
#endif
    }
  else 
    {
      cerr << "Unknown option \"" << argv[1] << "\"\n" << usage;
      exit(1);      
    };
  
  /*---------------------------------------------------------------------*\
   * Write initialization message to the log file and the out file
  \*---------------------------------------------------------------------*/
  ofstream log( kLogFile.c_str(), ios::out | ios::app );
  if ( log )
    {
      log << endl << version << date 
	  << " level=" << theStream.value() 
	  << " length=" << theStream.length()
	  << " width=" << theStream.width() << endl;
      log.close();
    };

  ofstream out( kOutFile.c_str(), ios::out | ios::app );
  if ( out )
    {
      out << endl << date 
	  << " level=" << theStream.value() 
	  << " length=" << theStream.length()
	  << " width=" << theStream.width() << endl;
      out.close();
    };
  
  /*---------------------------------------------------------------------*\
   * Search for multiple values
  \*---------------------------------------------------------------------*/
  search_repetitions(theStream);
};

/*=======================================================================*\
 * end of file "taxicab.cc"
\*=======================================================================*/

