/*=======================================================================*\
 * sumcubes.cc : calculate a^3+b^3 with 96bit integers
 * Copyright (C) 2003-2005 Michael Eisermann, all rights reserved
 * Institut Fourier, Université Grenoble I, France
 *
 * Please send bug reports, suggestions, and comments to
 *   Michael Eisermann <Michael.Eisermann@ujf-grenoble.fr>
 * For more information and updates see the web page
 *   http://www-fourier.ujf-grenoble.fr/~eiserm
 * If you have used my software in your research, please let me know.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or 
 * (at your option) any later version. In particular, permission to use, 
 * copy, modify, or redistribute this software for research and education 
 * is granted without fee, provided that the above copyright notice and 
 * this permission appear in all copies and supporting documentation. 
 *
 * This software is distributed in the hope that it will be useful,
 * but it is provided "as is" without any warranty, without even 
 * the implied warranty of fitness for a particular purpose. 
 * See the GNU General Public License for more details.
 *
 *=======================================================================
 * Description:
 *-------------
 * The class  ulllint  provides 96bit integers, together with optimized
 * assembly code for i486 processors, in order to represent integer values 
 * ranging from 0 to 7.9e28. The additional class  Function  provides 
 * the calculation of a^3+b^3, the sum of cubes mentioned in the title.
 *-----------------------------------------------------------------------
 * Implementation and optimization issues:
 *----------------------------------------
 * The class ulllint is far less general and less comfortable than GMP 
 * integers, it is not portable (it is almost hard-wired ;-), but objects 
 * require less memory and handling is several times faster. To this end
 * I have tried to hand-craft some low-level functions in assembly code.
 * The downside is that it runs only on i486 processors. 
 *
 * Substitute C-code is provided to ensure a portable version, even 
 * though slightly less efficient. To compare both approaches,
 * the substitute C-code can be compiled with option -S, and the
 * resulting assembly code can then be checked and tuned by hand.
 *
 * For argument passing we use gcc's extended asm facility, which 
 * is a clean and efficient solution. For details see the info pages:
 *   info gcc "C Extensions" "Extended Asm"
 *=======================================================================
 * Implementation and change history (in reverse chronological order)
 *-----------------------------------------------------------------------
 * 2005/06/11, version 0.4, author: Michael Eisermann
 * - Extensively revised assembly code, using gcc's extended asm.
 *   This resolves conflicts between optimization and asm inlining.
 *-----------------------------------------------------------------------
 * 2005/06/01, version 0.3, author: Michael Eisermann
 * - Minor changes in order to comply with "bimonotone.hh" version 0.7:
 *   modified interface for bimonotone functions derived from BimABtoZ.
 *-----------------------------------------------------------------------
 * 2004/08/24, version 0.2, author: Michael Eisermann
 * - Analyzed profile using gprof, optimized assembly code in comparisons
 *-----------------------------------------------------------------------
 * 2004/07/28, version 0.1, author: Michael Eisermann
 * - Implemented (some of) the basic methods for an int-like type
 * - Optimized comparison, addition, subtraction and simple
 *   multiplication/division in assembly code (using embedded asm)
\*=======================================================================*/

#include <functional>
#include <string>
using namespace std;

// #define _asm  // use inline asm code where possible
// #undef _asm   // use C code instead of asm code

/*-----------------------------------------------------------------------*\
 * We first define data types ulint and ullint that must be provided by 
 * the compiler to cater for 32bit and 64bit integers. Building on these, 
 * we then implement 96bit integers (to efficiently calculate a^3+b^3).
\*-----------------------------------------------------------------------*/

// type for 32bit integers
typedef unsigned long int ulint;

// type for 64bit integers
typedef unsigned long long int ullint;

// type for 96bit integers
class ulllint
{
public:
  // Data elements: three unsigned long integers (in big-endian order)
  ulint a0,a1,a2;

private:
  // Private functions (embedded assembly code for optimal performance)
  static  int   cmp3int ( const ulint* op1, const ulint* op2 );
  static  bool  eq3int  ( const ulint* op1, const ulint* op2 );
  static  bool  ne3int  ( const ulint* op1, const ulint* op2 );
  static  bool  le3int  ( const ulint* op1, const ulint* op2 );
  static  bool  lt3int  ( const ulint* op1, const ulint* op2 );
  static  bool  ge3int  ( const ulint* op1, const ulint* op2 );
  static  bool  gt3int  ( const ulint* op1, const ulint* op2 );
  static  void  add3int ( ulint* dest, const ulint* source );
  static  void  add3int2( ulint* dest, const ullint& op );
  static  void  neg3int ( ulint* dest );
  static  void  sub3int ( ulint* dest, const ulint* source );
  static  void  mul3int ( ulint* dest, ulint factor );
  static  ulint div3int ( ulint* dest, ulint divisor );

public:
  // Standard constructors to convert from primitive C++ types 
  ulllint() 
    : a0(0ul), a1(0ul), a2(0ul) {};
  ulllint( ulint a ) 
    : a0(a),  a1(0ul), a2(0ul) {};
  ulllint( ullint a ) 
    : a0(ulint(a)), a1(ulint(a>>32)), a2(0ul) {};
  ulllint( const ulllint& source )
    : a0(source.a0), a1(source.a1), a2(source.a2) {};
  ulllint( const string& source )
    { read_string(source); };

  // Standard conversions
  operator ulint() const
    { return a0; };
  operator ullint() const
    { return *((ullint*)(&a0)); };
  operator string() const
    { string s; write_string(s); return s; };

  // Assignement operators
  void reset(void)
    { a0= 0ul; a1= 0ul; a2= 0ul; };
  ulllint& operator = ( ulint a )
    { a0= a; a1= 0ul; a2= 0ul; return *this; };
  ulllint& operator = ( ullint a )
    { a0= ulint(a); a1= ulint(a>>32); a2= 0ul; return *this; };
  ulllint& operator = ( const ulllint& source )
    { a0= source.a0; a1= source.a1; a2= source.a2; return *this; };
  ulllint& operator = ( const string& source )
    { read_string(source); return *this; };

  // Comparison operators
  bool is_zero(void) const
    { return !( a0 | a1 | a2 ); };
  friend int compare( const ulllint& op1, const ulllint& op2 )
    { return cmp3int( &(op1.a0), &(op2.a0) ); };

  bool operator == ( const ulllint& op ) const
    { return eq3int( &a0, &(op.a0) ); };
  bool operator != ( const ulllint& op ) const
    { return ne3int( &a0, &(op.a0) ); };
  bool operator <= ( const ulllint& op ) const
    { return le3int( &a0, &(op.a0) ); };
  bool operator <  ( const ulllint& op ) const
    { return lt3int( &a0, &(op.a0) ); };
  bool operator >= ( const ulllint& op ) const
    { return ge3int( &a0, &(op.a0) ); };
  bool operator >  ( const ulllint& op ) const
    { return gt3int( &a0, &(op.a0) ); };

  // Addition 
  ulllint& operator += ( const ulllint& op )
    { add3int( &a0, &(op.a0) ); return *this; };
  ulllint operator + ( const ulllint& op ) const
    { ulllint result(*this); add3int( &(result.a0), &(op.a0) ); return result; };
  ulllint operator += ( const ullint& op )
    { add3int2( &a0, op ); return *this; };
  ulllint operator + ( const ullint& op ) const
    { ulllint result(*this); add3int2( &(result.a0), op ); return result; };

  // Subtraction and negation
  ulllint& operator -= ( const ulllint& op )
    { sub3int( &a0, &(op.a0) ); return *this; };
  ulllint operator - ( const ulllint& op ) const
    { ulllint result(*this); sub3int( &(result.a0), &(op.a0) ); return result; };
  ulllint operator - () const
    { ulllint result(*this); neg3int( &(result.a0) ); return result; };

  ulllint& operator -= ( const ullint& op )
    { return operator-= ( ulllint(op) ); };
  ulllint operator - ( const ullint& op ) const
    { return (*this) - ulllint(op); };

  // Multiplication
  ulllint& operator *= ( ulint factor )
    { mul3int( &a0, factor ); return *this; };
  ulllint operator * ( ulint factor ) const
    { ulllint result(*this); mul3int( &(result.a0), factor ); return result; };

  // Division with remainder
  ulllint& operator /= ( ulint divisor )
    { div3int( &a0, divisor ); return *this; };
  ulllint operator / ( ulint divisor ) const
    { ulllint result(*this); div3int( &(result.a0), divisor ); return result; };
  ulint operator % ( ulint divisor ) const
    { ulllint temp(*this); return div3int( &(temp.a0), divisor ); };

  // Conversion to and from strings
  void read_string( const string& source );
  void write_string( string& dest ) const;
};

/*=======================================================================*\
 * class ulllint: implementation of member functions
\*=======================================================================*/

// Conversion of string to ulllint
void 
ulllint::read_string( const string& source )
{
  reset();
  ulllint digit;
  for ( string::const_iterator it= source.begin(); it!=source.end(); ++it )
    {
      if ( !isdigit(*it) ) return;
      mul3int( &a0, 10l );
      digit.a0= ulint(*it-'0');
      add3int( &a0, &(digit.a0) );
    };
};

// Conversion of ulllint to string
void 
ulllint::write_string( string& dest ) const
{
  if ( is_zero() ) 
    {
      dest= "0";
      return;
    };
  dest= string();
  ulllint dummy(*this);
  do
    {
      ulint rest= div3int( &(dummy.a0), 10l );
      dest= char(rest+'0') + dest;
    }
  while ( !dummy.is_zero() );
};

// Input operators (via conversion to and from strings)
istream& 
operator >> ( istream& in, ulllint& op )
{
  string buffer;
  in >> buffer;
  op.read_string(buffer);
  return in;
};

// Output operators (via conversion to and from strings)
ostream& 
operator << ( ostream& out, ulllint op )
{
  string buffer;
  op.write_string(buffer);
  out << buffer;
  return out;
};

/*-----------------------------------------------------------------------*\
 * The following low-level functions are implemented in assembly code 
 * in order to obtain optimal performance. This seems reasonable because
 * it is only applied to a handful of functions that do 95% of the work.
\*-----------------------------------------------------------------------*/

// Compare two 96bit integers
inline int 
ulllint::cmp3int( const ulint* op1, const ulint* op2 )
{
  // The comparison given in C is correct and a good compiler
  // should have no difficulties producing optimal assembly code.
  // But beware of surprises! Since comparison is so often used,
  // it seems safer to manually provide an optimized solution.

#ifndef _asm

  asm("### Compare two triple-ints");
  if ( op1[2] > op2[2] ) return 1;
  if ( op1[2] < op2[2] ) return -1;
  if ( op1[1] > op2[1] ) return 1;
  if ( op1[1] < op2[1] ) return -1;
  if ( op1[0] > op2[0] ) return 1;
  if ( op1[0] < op2[0] ) return -1;
  return 0;

#else

  int result;
  asm volatile
    ( "### Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 2f                \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 2f                \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "jb 1f                \n\t"
      "ja 2f                \n\t"
      "xorl %%eax,%%eax     \n\t"
      "jmp 9f               \n\t"

      "\n2:\t# return minus one \n\t" 
      "movl $-1,%%eax       \n\t"
      "jmp 9f               \n\t"

      "\n1:\t# return plus one \n\t" 
      "movl $1,%%eax        \n\t"
      "\n9:\t# exit         \n\t" 
      
      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Compare two 96bit integers: check whether op1 == op2
inline bool 
ulllint::eq3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Equal? Compare two triple-ints");
  if ( op1[2] != op2[2] ) return false;
  if ( op1[1] != op2[1] ) return false;
  if ( op1[0] != op2[0] ) return false;
  return true;

#else

  bool result;
  asm volatile
    ( "### Equal? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "jne 0f               \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "jne 0f               \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "jne 0f               \n\t"

      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "jmp 9f               \n\t"
      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;
      
#endif
};

// Compare two 96bit integers: check whether op1 != op2
inline bool 
ulllint::ne3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Not equal? Compare two triple-ints");
  if ( op1[2] != op2[2] ) return true;
  if ( op1[1] != op2[1] ) return true;
  if ( op1[0] != op2[0] ) return true;
  return false;

#else

  bool result;
  asm volatile
    ( "### Not equal? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "jne 1f               \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "jne 1f               \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "jne 1f               \n\t"

      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "jmp 9f               \n\t"
      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Compare two 96bit integers: check whether op1 <= op2
inline bool
ulllint::le3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Less or equal? Compare two triple-ints");
  if ( op1[2] > op2[2] ) return false;
  if ( op1[2] < op2[2] ) return true;
  if ( op1[1] > op2[1] ) return false;
  if ( op1[1] < op2[1] ) return true;
  if ( op1[0] > op2[0] ) return false;
  return true;

#else

  bool result;
  asm volatile
    ( "### Less or equal? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "ja 1f                \n\t"
      "jb 0f                \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "ja 1f                \n\t"
      "jb 0f                \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "jb 0f                \n\t"

      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "jmp 9f               \n\t"
      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Compare two 96bit integers: check whether op1 < op2
inline bool 
ulllint::lt3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Less? Compare two triple-ints");
  if ( op1[2] < op2[2] ) return true;
  if ( op1[2] > op2[2] ) return false;
  if ( op1[1] < op2[1] ) return true;
  if ( op1[1] > op2[1] ) return false;
  if ( op1[0] < op2[0] ) return true;
  return false;

#else

  bool result;
  asm volatile
    ( "### Less? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "ja 1f                \n\t"
      "jb 0f                \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "ja 1f                \n\t"
      "jb 0f                \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "ja 1f                \n\t"

      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "jmp 9f               \n\t"
      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Compare two 96bit integers: check whether op1 <= op2
inline bool 
ulllint::ge3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Greater or equal? Compare two triple-ints");
  if ( op1[2] < op2[2] ) return false;
  if ( op1[2] > op2[2] ) return true;
  if ( op1[1] < op2[1] ) return false;
  if ( op1[1] > op2[1] ) return true;
  if ( op1[0] < op2[0] ) return false;
  return true;

#else

  bool result;
  asm volatile
    ( "### Greater or equal? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 0f                \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 0f                \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "ja 0f                \n\t"

      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "jmp 9f               \n\t"
      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Compare two 96bit integers: check whether op1 < op2
inline bool 
ulllint::gt3int( const ulint* op1, const ulint* op2 )
{
#ifndef _asm

  asm("### Greater? Compare two triple-ints");
  if ( op1[2] > op2[2] ) return true;
  if ( op1[2] < op2[2] ) return false;
  if ( op1[1] > op2[1] ) return true;
  if ( op1[1] < op2[1] ) return false;
  if ( op1[0] > op2[0] ) return true;
  return false;
  
#else

  bool result;
  asm volatile
    ( "### Greater? Compare two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax holds temporary values and the final result \n\t"
      
      "movl 8(%%edi),%%eax  \n\t"
      "cmpl %%eax,8(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 0f                \n\t"
      
      "movl 4(%%edi),%%eax  \n\t"
      "cmpl %%eax,4(%%esi)  \n\t"
      "jb 1f                \n\t"
      "ja 0f                \n\t"
      
      "movl (%%edi),%%eax   \n\t"
      "cmpl %%eax,(%%esi)   \n\t"
      "jb 1f                \n\t"

      "\n0:\t# return false \n\t" 
      "xorl %%eax,%%eax     \n\t"
      "jmp 9f               \n\t"
      "\n1:\t# return true  \n\t" 
      "movl $1,%%eax        \n\t"
      "\n9:\t# exit         \n\t" 

      : "=a" (result) /* output */
      : "D" (op1), "S" (op2) /* input */ 
      : "cc" /* clobbered */ );
  return result;

#endif
};

// Addition of two 96bit integers (ulllint)
inline void 
ulllint::add3int( ulint* dest, const ulint* source )
{
  // The addition given in C is correct but not optimally coded.
  // Note, in particular, that in C we cannot access the carry bit. 
  // This problem is easily corrected in the assembly code below.

#ifndef _asm

  asm("### Add two triple-ints");
  ullint sum;

  sum= dest[0];
  sum+= source[0];
  dest[0] = ulint(sum);
  sum>>= 32; // carry

  sum+= dest[1];
  sum+= source[1];
  dest[1] = ulint(sum);
  sum>>= 32; // carry
  
  sum+= dest[2];
  sum+= source[2];
  dest[2] = ulint(sum);
  // sum>>= 32; // carry
  // return sum;

#else

  asm volatile
    ( "### Add two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax will be used as temporary register \n\t"
      "movl (%%esi),%%eax   \n\t"
      "addl %%eax,(%%edi)   \n\t"
      "movl 4(%%esi),%%eax  \n\t"
      "adcl %%eax,4(%%edi)  \n\t"
      "movl 8(%%esi),%%eax  \n\t"
      "adcl %%eax,8(%%edi)  \n\t"
      
      : /* output */
      : "D" (dest), "S" (source) /* input */
      : "%eax", "memory", "cc" /* clobbered */ );

#endif
};

// Addition of a 96bit integer (ulllint) and a 64bit integer (ullint)
inline void 
ulllint::add3int2( ulint* dest, const ullint& op )
{
  // The addition given in C is correct but not optimally coded.
  // Note, in particular, that in C we cannot access the carry bit. 
  // This problem is easily corrected in the assembly code below.

#ifndef _asm

  asm("### Add a triple-int and a double-int");
  ullint sum;

  sum= dest[0];
  sum+= ulint(op);
  dest[0] = ulint(sum);
  sum>>= 32; // carry

  sum+= dest[1];
  sum+= ulint(op>>32);
  dest[1] = ulint(sum);
  sum>>= 32; // carry
  
  sum+= dest[2];
  dest[2] = ulint(sum);
  // sum>>= 32; // carry
  // return sum;

#else

  asm volatile
    ( "### Add a triple-int and a double-int \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%eax,%%edx hold the second term \n\t"
      "addl %%eax,(%%edi)   \n\t"
      "adcl %%edx,4(%%edi)  \n\t"
      "adcl $0,8(%%edi)     \n\t"

      : /* output */
      : "D" (dest), "a" (ulint(op)), "d" (ulint(op>>32)) /* input */
      : "memory", "cc" /* clobbered */ );

#endif
};

// Subtraction of two 96bit integers (ulllint)
inline void 
ulllint::sub3int( ulint* dest, const ulint* source )
{
#ifndef _asm
  
  asm("### Subtract two triple-ints");
  ulllint temp( *(ulllint*)(source) );
  neg3int(&(temp.a0));
  add3int(dest,&(temp.a0));

#else

  asm volatile
    ( "### Subtract two triple-ints \n\t"
      "### %%edi holds the destination address (first term) \n\t"
      "### %%esi holds the source address (second term) \n\t"
      "### %%eax will be used as temporary register \n\t"
      "movl (%%esi),%%eax   \n\t"
      "subl %%eax,(%%edi)   \n\t"
      "movl 4(%%esi),%%eax  \n\t"
      "sbbl %%eax,4(%%edi)  \n\t"
      "movl 8(%%esi),%%eax  \n\t"
      "sbbl %%eax,8(%%edi)  "

      : /* output */
      : "D" (dest), "S" (source) /* input */
      : "%eax", "memory", "cc" /* clobbered */ );

#endif
};

// Negate a 96int integer (ulllint) : bit complement + 1
inline void 
ulllint::neg3int( ulint* dest )
{
#ifndef _asm

  asm("### Negate triple-int");
  ullint sum(1ul);
  sum+= ~dest[0];
  dest[0] = ulint(sum);
  sum>>= 32; // carry
  sum+= ~dest[1];
  dest[1] = ulint(sum);
  sum>>= 32; // carry
  sum+= dest[2];
  dest[2] = ulint(sum);
  // sum>>= 32; // carry
  // return sum;
  return;

#else

  asm volatile
    ( "### Negate triple-int \n\t"
      "### %%edi holds the source/destination address \n\t"
      "### %%eax will hold the words to negate \n\t"
      
      "movl (%%edi),%%eax  \n\t"
      "notl %%eax          \n\t"
      "addl $1,%%eax       \n\t"
      "movl %%eax,(%%edi)  \n\t"
      
      "movl 4(%%edi),%%eax \n\t"
      "notl %%eax          \n\t"
      "adcl $0,%%eax       \n\t"
      "movl %%eax,4(%%edi) \n\t"
      
      "movl 8(%%edi),%%eax \n\t"
      "notl %%eax          \n\t"
      "adcl $0,%%eax       \n\t"
      "movl %%eax,8(%%edi) "
      
      : /* output */
      : "D" (dest) /* input */
      : "%eax", "memory", "cc" /* clobbered */ );

#endif
};

// Multiplication of a 96bit integer (ulllint) by a 32bit integer (ulint)
inline void 
ulllint::mul3int( ulint* dest, ulint factor )
{
  // The multiplication in C is correct but not optimally coded.
  // A good compiler would possibly produce the assembly code given below.
  // Notice also that a possible overflow is ignored. This could easily
  // be corrected by returning the last carry as the function value.

#ifndef _asm

  asm("### Multiply triple-int by single-int ");
  ullint product;

  product  = ullint(dest[0]) * factor;
  dest[0]  = ulint(product);
  product>>= 32;

  product += ullint(dest[1]) * factor;
  dest[1]  = ulint(product);
  product>>= 32;
  
  product += ullint(dest[2]) * factor;
  dest[2] = ulint(product);
  // product>>= 32;
  // return product;
  return;

#else

  asm volatile
    ( "### Multiply triple-int by single-int \n\t"
      "### %%edi holds the source/destination address \n\t"
      "### %%ebx holds the factor with which to multiply \n\t"
      "### %%eax and %%edx will hold the double-word product \n\t"
      "### %%ecx will hold the carry word \n\t"

      "movl (%%edi),%%eax  \n\t"
      "mull %%ebx          \n\t"
      "movl %%edx,%%ecx    \n\t"
      "movl %%eax,(%%edi)  \n\t"
      
      "movl 4(%%edi),%%eax \n\t"
      "mull %%ebx          \n\t"
      "addl %%ecx,%%eax    \n\t"
      "adcl $0,%%edx       \n\t"
      "movl %%edx,%%ecx    \n\t"
      "movl %%eax,4(%%edi) \n\t"
      
      "movl 8(%%edi),%%eax \n\t"
      "mull %%ebx          \n\t"
      "addl %%ecx,%%eax    \n\t"
      "#adcl $0,%%edx      \n\t"
      "#movl %%edx,%%ecx   \n\t"
      "movl %%eax,8(%%edi) "
      
      : /* output */
      : "D" (dest), "b" (factor) /* input */
      : "%eax", "%edx", "%ecx", "memory", "cc" /* clobbered */ );

#endif
};

// Division with remainder of a 96bit integer by a 32bit integer
inline ulint 
ulllint::div3int( ulint* dest, const ulint divisor )
{
  // Division is correct but certainly not optimally coded.
  // An optimized assembler routine would perform much better...

  asm("### <div3int, compiled>");
  ullint rest;

  rest= dest[2];
  dest[2]/= divisor;
  rest-= dest[2] * divisor;
  
  rest<<=32;
  rest+= dest[1];
  dest[1]= rest / divisor;
  rest-= dest[1] * divisor;

  rest<<=32;
  rest+= dest[0];
  dest[0]= rest / divisor;
  rest-= dest[0] * divisor;

  return ulint(rest);
};


/*=======================================================================*\
 * Define the function f(a,b) = a^3 + b^3
 *=======================================================================
 * In order to comply with the template requirements in "bimonotone.hh"
 * this will be a function object whose class is derived from BimABtoZ.
 * For the algorithms to work correctly, we require that f : X -> Z be 
 * a proper bimonotone function defined on a bimonotone domain X.
 * (See the comments for the base class BimABtoZ in "bimonotone.hh".)
 * These conditions are satisfied by the implementation that follows.
\*=======================================================================*/
#include "bimonotone.hh"
class Function : BimABtoZ<ulint,ulint,ulllint,ullint>
{
public:
  // Description and filename to be used for this function
  string description() const
    { return string("f(a,b) = a^3+b^3 with 1 <= a <= b"); };
  string filename() const
    { return string("a^3+b^3"); };

  // Typenames inherited from BimABtoZ
  typedef BimABtoZ<ulint,ulint,ulllint,ullint>::A A;
  typedef BimABtoZ<ulint,ulint,ulllint,ullint>::B B;
  typedef BimABtoZ<ulint,ulint,ulllint,ullint>::Z Z;
  typedef BimABtoZ<ulint,ulint,ulllint,ullint>::D D;

  // Evaluate the function f(a,b) = a^3 + b^3
  Z operator() ( A a, B b ) const 
    { 
      Z result;
      sumcubes( &result, a, b );
      return result;
    };

  // The subdomain of A x B where the function is defined.
  bool domain_contains( A a, B b ) const 
    { return ( 1<=a && a<=b ); };
  A a_min() const { return A(1); };
  B b_min() const { return B(1); };
  
  // Calculate the differential  f(a+1,b)-f(a,b), here 
  // ((a+1)^3+b^3) - (a^3+b^3) = 3a^2 + 3a + 1 = 3a(a+1) + 1
  D diff1( A a, B b ) const
    { 
      D d(a);
      ++d;
      d*= a;
      d*= 3;
      ++d;
      return d;
    };

  // Calculate the differential  f(a,b+1)-f(a,b), here
  // (a^3+(b+1)^3) - (a^3+b^3) = 3b^2 + 3b + 1 = 3b(b+1) + 1
  D diff2( A a, B b ) const
    { 
      D d(b);
      ++b;
      d*= b;
      d*= 3;
      ++d;
      return d;
    };

private:
  // This auxiliary function calculates a^3 + b^3
  static void sumcubes( ulllint* dest, ulint a, ulint b );
};

/*-----------------------------------------------------------------------*\
 * Calculate the sum of two cubes : a^3 + b^3
\*-----------------------------------------------------------------------*/
inline void 
Function::sumcubes( ulllint* dest, ulint a, ulint b )
{
#ifndef _asm

  asm("### Calculate the sum  a^3 + b^3 ");
  ullint a2(a); 
  a2*= a;
  ulllint &a3 (*dest); 
  a3= a2; 
  a3*= a; 
  ullint b2(b);
  b2*= b;
  ulllint b3(b2); 
  b3*= b; 
  a3+= b3;
  
#else
  
  asm volatile
    ( "### Calculate the sum  a^3 + b^3 \n\t"
      "### %%edi holds the destination address \n\t"
      "### %1 holds the value of a \n\t"
      "### %2 holds the value of b \n\n\t"
      
      "### calculate the cube of %%ecx in %%ebx,%%eax,%%edx \n\t"
      "movl %1,%%ecx       \n\t"
      "movl %%ecx,%%eax    \n\t"
      "mull %%ecx          \n\t"
      "movl %%edx,%%esi    \n\t"
      "mull %%ecx          \n\t"
      "movl %%eax,%%ebx    \n\t"
      "movl %%esi,%%eax    \n\t"
      "movl %%edx,%%esi    \n\t"
      "mull %%ecx          \n\t"
      "addl %%esi,%%eax    \n\t"
      "adcl $0,%%edx       \n\t"
      
      "### store the first cube in the destination \n\t"
      "movl %%ebx,(%%edi)  \n\t"
      "movl %%eax,4(%%edi) \n\t"
      "movl %%edx,8(%%edi) \n\t"
      
      "### calculate the cube of %%ecx in %%ebx,%%eax,%%edx \n\t"
      "movl %2,%%ecx       \n\t"
      "movl %%ecx,%%eax    \n\t"
      "mull %%ecx          \n\t"
      "movl %%edx,%%esi    \n\t"
      "mull %%ecx          \n\t"
      "movl %%eax,%%ebx    \n\t"
      "movl %%esi,%%eax    \n\t"
      "movl %%edx,%%esi    \n\t"
      "mull %%ecx          \n\t"
      "addl %%esi,%%eax    \n\t"
      "adcl $0,%%edx       \n\t"
      
      "### add the second cube to the destination \n\t"
      "addl %%ebx,(%%edi)  \n\t"
      "adcl %%eax,4(%%edi) \n\t"
      "adcl %%edx,8(%%edi)"
      
      : /* output */
      : "D" (dest), "m" (a), "m" (b) /* input */
      : "%eax", "%ebx", "%ecx", "%edx", "%esi", "memory", "cc" /* clobbered */ );
  
#endif
};

/*=======================================================================*\
 * end of file "sumcubes.cc"
\*=======================================================================*/


