// Copyright Mirela Andronescu 2005-2008
// Takes as input a parameter set, a file set*.txt, with sequence and various predictions
//  (including predictions with the latest parameter set), and creates linear constraints for CPLEX

// Functions that you need for this file:
//  - A function that returns the number of parameters used in your model
//  - A function that takes as input a file with a set of parameters, and fills your internal data structures
//  - A function that computes how many times each feature occurs in a given structure and the free energy
//  - [optionally] A function that computes the free energy (under your model) of a sequence folded into a given structure.

///////////////////////////////////////////////////////////////////
// MODIFY THIS PART 

// include your own header files
#include "simfold.h"
#include "common.h"
#include "externs.h"
#include "params.h"

// Make sure this number for the maximum length of the sequences is large enough
#define MAXLEN      5000    // maximum length of sequences

// Initialize the data, fill the parameter data structures
// You can write anything here
// "\" separates lines in the define directive
// If you have nothing to initialize, leave it blank:        #define INITIALIZE
#define INITIALIZE \
    char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\
    init_data (argv[0], config_file, RNA, 37.0);
    //fill_data_structures_with_new_parameters ("../../../MultiRNAFold/params/turner_parameters_fm363_constrdangles.txt");

    
// Provide a function that takes as input a file with a set of parameters, and fills your internal data structures
// Prototype: void fill_data_structures_with_new_parameters (char *parameters_file)
// The parameters_file should have one parameter value per line.
// For example, if you have 3 parameters in your model, with values 1.00, 2.00, and 3.00, the file should be:
// 1.00
// 2.00
// 3.00
// (without the "// ", of course).
#define FUNCTION_READ_PARAMETERS_FROM_FILE  fill_data_structures_with_new_parameters


// Provide the name of a function that returns the number of parameters in your model
// Prototype: int get_num_params()
#define FUNCTION_GET_NUMBER_OF_PARAMETERS  get_num_params


// Specify whether your energy function is LINEAR or QUADRATIC
// *  If your energy function is LINEAR in the energy parameters,
//    then your energy function can be written like this:
//    - deltaG = c' x + f
//        - where x is the vector of parameters
//        - c is the vector of how many times each parameter occurs in the given structure
//        - c' means c transposed
//        - f is a constant
//    - As a simple example, if your model has 3 parameters x1, x2 and x3, and deltaG for some structure is
//      x1 + x3 + x3 + 0.5, then c'=(1,0,2) and f=0.5;
//    - The Turner model underlying mfold, simfold, RNAstructure, RNA Vienna package etc is linear.
// * If your energy function is QUADRATIC in the energy parameters, 
//   then your energy function can be written like this:
//    - deltaG = x' P x + c' x + f
//        - where x is the vector of parameters
//        - P is a symmetric matrix of the coefficients for each quadratic term
//        - c is a vector of counts for each linear term
//        - c' means c transposed
//        - f is a constant
//    - As a simple example, if your model has 3 parameters x1, x2 and x3, and deltaG for some structure is
//      x1*x2 + 0.5*x2*x3 + 2*x2, then P = (0,1,0; 1,0,0.5; 0,0.5,0), c' = (0,2,0) and f=0.
//    - The Dirks&Pierce and Rivas&Eddy models for pseudoknotted structures are quadratic.
#define ENERGY_FUNCTION_DEGREE  LINEAR
//#define ENERGY_FUNCTION_DEGREE  QUADRATIC


// Provide the name of a function that returns the number of occurences (counts) for each feature (see below).
// * If ENERGY_FUNCTION_DEGREE above is LINEAR, the prototype is:
//   double get_feature_counts (char *sequence, char *structure, double *linear_vector, double &free_value);
//      - sequence is a given sequence (input parameter)
//      - structure is a given secondary structure (input parameter)
//      - linear_vector is the c vector above, an array of doubles (output parameter).
//          The function writes the number of occurences for each feature in this vector,
//          for example linear_vector[0] is the number of times the first feature occurs
//      - free_value is the free parameter f above.
//      - assume the memory space was allocated for all input and output variables
//      - return the free_energy of sequence folded into structure. Free_energy should be = c' x + f.
// * If ENERGY_FUNCTION_DEGREE above is QUADRATIC, the prototype is:
//   double get_feature_counts_quadratic (char *sequence, char *structure, double **quadratic_matrix, double *linear_vector, double &free_value);
//      - quadratic_matrix is the P matrix above. Assume the space for it is already allocated.
//        Since the matrix is symmetric, we need only the upper triangle, so only fill it for i <= j.
//      - linear_vector is the c vector above. Assume the space for it is already allocated.
//      - free_value is the free parameter f above.
//      - return the free_energy of sequence folded into structure. Free_energy should be = x' P x + c' x + f.
#define FUNCTION_GET_FEATURE_COUNTS   get_feature_counts
//#define FUNCTION_GET_FEATURE_COUNTS   get_feature_counts_quadratic


// If you have another function which computes the free energy of a sequence folded into a given structure,
//      it is a good idea to check whether FUNCTION_GET_FEATURE_COUNTS returns the same value
// First set HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION is that's the case
#define HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION 1

// If the above is 1, then provide the free energy function
//  If the above is 0, leave this unchanged.
// Prototype: double free_energy_simfold (char *sequence, char *structure);
//      returns the free energy (in kcal/mol) of sequence folded into structure
#if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1)
#define FUNCTION_FREE_ENERGY free_energy_simfold
#endif 

// That's it, please let me know if it doesn't compile
// DO NOT MODIFY ANYTHING FROM NOW ON!
///////////////////////////////////////////////////////////////////

// standard header files
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define LINEAR  1
#define QUADRATIC 2


int read_params_from_file (char *filename, double *params)
// read parameters (one per row) in filename, and fill the params array.
// return the number of parameters
{
    FILE *file;
    int i = 0;
    char buffer [500];
    if ((file = fopen (filename, "r")) == NULL)
    {
        printf ("Cannot open file %s, ABORT!\n", filename);
        exit(1);
    }
    fgets (buffer, sizeof(buffer), file);
    while (!feof (file))
    {
        sscanf (buffer, "%lf\n", &params[i++]);
        fgets (buffer, sizeof(buffer), file);
    }    
    fclose(file);
    return i;
}


void reset_linear (int num_parameters, double *array)
{
    for (int i=0; i < num_parameters; i++)
    {
        array[i] = 0;
    }
}

void reset_quadratic (int num_parameters, double **array)
{
    for (int i=0; i < num_parameters; i++)
    {
        for (int j=i; j < num_parameters; j++)
        {
            array[i][j] = 0;
        }
    }
}

#define PARAMTYPE   double  

int main(int argc, char *argv[])
{
    if (argc != 5)
    {
        printf ("Usage: %s <parameter_file> <input_set_file> <output_constraints_file> <output_num_constraints_permol_filename> \n", argv[0]);
        exit(0);
    }

    // variables
    FILE *set_file;
    FILE *constraints_file;
    FILE *num_constraints_permol_file;
    char new_params_filename[500];
    char set_filename[500];
    char constraints_filename[500];
    char num_constraints_permol_filename[500];    
    int i, j;
    char sequence[MAXLEN];
    char structure_known[MAXLEN];
    char restricted[MAXLEN];
    char structure_pred[MAXLEN];
    char buffer [5000];
    char header [MAXLEN];
    double *linear_vector_known;
    double *linear_vector_pred;
    double free_value_known;
    double free_value_pred;
    char constraint[50000];        
    int num_constraints_permol[30000];
    for (i=0; i < 30000; i++)
    {
        num_constraints_permol[i] = 0;
    }
    int num_molecules = 0;
    double accuracy_threshold = 1.0;    // include only those constraints which have accuracy < threshold    
    double energy_fe, energy_this;
    int replace_instead_of_add = 0;     // replacing the old constraints gives very poor results, so this should be 0
    char parameter_filename[500];
    double alt_energy;

    // get the input arguments
    strcpy (new_params_filename, argv[1]);
    strcpy (set_filename, argv[2]);
    strcpy (constraints_filename, argv[3]);
    strcpy (num_constraints_permol_filename, argv[4]);

    // In simfold, I need to initialize the thermodynamic parameter once
    INITIALIZE;    

    // fill the internal data structures with the new parameters
    FUNCTION_READ_PARAMETERS_FROM_FILE (new_params_filename);
    
    // I need to know how many parameters there are in the model
    int num_parameters = FUNCTION_GET_NUMBER_OF_PARAMETERS();

    double *params_array;
    params_array = new double[num_parameters];
    int n = read_params_from_file (new_params_filename, params_array);
    if (n != num_parameters)
    {
        printf ("ERROR! Number of parameters in file %s is %d and is NOT the same as what the function %s reported: %d!\n",
            new_params_filename, n, FUNCTION_GET_NUMBER_OF_PARAMETERS, num_parameters);
        exit(1);
    }
    // now params_array is filled with the values in the given parameter file

    // Allocate the space for the counters
    linear_vector_known = new double[num_parameters];
    if (linear_vector_known == NULL)
    {
        printf ("ERROR! Space could not be allocated for linear_vector_known, ABORT!\n");
        exit(1);
    }
    linear_vector_pred = new double[num_parameters];
    if (linear_vector_pred == NULL)
    {
        printf ("ERROR! Space could not be allocated for linear_vector_pred, ABORT!\n");
        exit(1);
    }

    // allocate the two dimensional array if applicable
    #if (ENERGY_FUNCTION_DEGREE == QUADRATIC)

    double **quadratic_matrix_known;
    quadratic_matrix_known = new double*[num_parameters];
    if (quadratic_matrix_known == NULL)
    {
        printf ("ERROR! Space could not be allocated for quadratic_matrix_known, ABORT!\n");
        exit(1);
    }
    for (i=0; i < num_parameters; i++)
    {
        // I tried to allocate only i+1 elements, but it crashes during delete - dunno why
        quadratic_matrix_known[i] = new double[num_parameters];
        if (quadratic_matrix_known[i] == NULL)
        {
            printf ("ERROR! Space could not be allocated for quadratic_matrix_known[%d], ABORT!\n", i);
            exit(1);
        }
    }
    double **quadratic_matrix_pred;
    quadratic_matrix_pred = new double*[num_parameters];
    if (quadratic_matrix_pred == NULL)
    {
        printf ("ERROR! Space could not be allocated for quadratic_matrix_pred, ABORT!\n");
        exit(1);
    }
    for (i=0; i < num_parameters; i++)
    {
        // I tried to allocate only i+1 elements, but it crashes during delete - dunno why
        quadratic_matrix_pred[i] = new double[num_parameters];
        if (quadratic_matrix_pred[i] == NULL)
        {
            printf ("ERROR! Space could not be allocated for quadratic_matrix_known[%d], ABORT!\n", i);
            exit(1);
        }
    }    
    #endif

    
    
    // open the files
    if ((constraints_file = fopen (constraints_filename,"w")) == NULL)
    {
        printf ("ERROR: Cannot open file %s\n", constraints_filename);
        return 0;
    }    
    if ((set_file = fopen (set_filename, "r")) == NULL)
    {
        printf ("ERROR: Cannot open file %s\n", set_filename);
        exit (0);
    }
    
    int k = 0;
    int delta_index = 0;

    //the format is:
    //    > header
    //    sequence
    //    known structure
    //    [restricted]
    //    pred structure 1
    //    pred structure 2
    //    empty line

    // start reading the input set file
    fgets (header, sizeof(buffer), set_file);    // header "> SSTRAND_ID etc"
    while (!feof(set_file))
    {              
        fgets (sequence, sizeof(buffer), set_file);        // sequence
        fgets (structure_known, sizeof(buffer), set_file);    // known structure
        fgets (buffer, sizeof(buffer), set_file);    // predicted structure or restricted line

        for (j=0; j < sizeof (buffer); j++)
        {
            if (j >= MAXLEN)
            {
                printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN);
                printf ("%s\n", buffer);
                exit(1);
            }
            if (buffer[j] == '\n')   { buffer[j] = '\0';  break; }
        }
        if ( strstr (buffer, "_") != NULL)    // it was the restricted line
        {                        
            strcpy (restricted, buffer);
            fgets (structure_pred, sizeof(buffer), set_file);    
        }        
        else    // it was structure pred
        {
            restricted[0] = '\0';
            strcpy (structure_pred, buffer);
        }
                  
        for (j=0; j < sizeof(buffer); j++)
        {
            if (j >= MAXLEN)
            {
                printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN);
                printf ("%s\n", sequence);
                exit(1);
            }            
            if(sequence[j] == '\n')
            {
                sequence[j] = '\0';
                structure_known[j] = '\0';
                structure_pred[j] = '\0';
                break;
            }
        }
        for (j=0; j < strlen (restricted); j++)   if(restricted[j] == '\n')  restricted[j] = '\0';
        for (j=0; j < strlen (header); j++)       if(header[j] == '\n') header[j] = '\0';
      
        //printf ("\n%s\n%s\n%d\n", sequence, structure_known, delta_index);
        
        // now I have the sequence and known structure
        
        // Get the counts for the known structure
        // First reset the counters
        reset_linear (num_parameters, linear_vector_known);
        #if (ENERGY_FUNCTION_DEGREE == LINEAR)
        energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_known, linear_vector_known, free_value_known);
        if (!check_counts_linear (num_parameters, params_array, linear_vector_known, free_value_known, energy_fe))
        {
            // message is printed from check_counts_linear
            printf ("%s\n%s\n", sequence, structure_known);
            exit(1);
        }
        #elif (ENERGY_FUNCTION_DEGREE == QUADRATIC)
        reset_quadratic (num_parameters, quadratic_matrix_known);
        energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_known, quadratic_matrix_known, linear_vector_known, free_value_known);
        if (!check_counts_quadratic (num_parameters, params_array, quadratic_matrix_known, linear_vector_known, free_value_known, energy_fe))
        {
            // message is printed from check_counts_linear
            printf ("%s\n%s\n", sequence, structure_known);
            exit(1);
        }        
        #else
        printf ("ERROR! Constant ENERGY_FUNCTION_DEGREE should be either LINEAR or QUADRATIC! Abort.\n");
        exit(1);
        #endif

        // also check if energy_fe is the same as the one provided separately
        #if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1)
        alt_energy = FUNCTION_FREE_ENERGY (sequence, structure_known);
        if (fabs (alt_energy - energy_fe) > 0.1)
        {
            printf ("dG by %s is different from dG by %s: %.2lf vs. %.2lf for \n%s\n%s\n",
                FUNCTION_FREE_ENERGY, FUNCTION_GET_FEATURE_COUNTS, alt_energy, energy_fe, sequence, structure_known);
            exit(1);
        }
        #endif         
       
        // Read all next predicted structures, up to empty line
        // The first structure pred has been read already above.
        // This is the structure predicted with the initial parameters
        while (!feof (set_file))
        {                      
            if (structure_pred[0] == '\n') break;
            structure_pred[strlen(sequence)] = '\0';
            //printf ("%s\n", structure_pred); fflush (stdout);                        
            
            // ignore it if the predicted structure is the same as the known structure
            if (strcmp(structure_known, structure_pred) == 0)
            {
                fgets (structure_pred, sizeof(buffer), set_file);
                continue;
            }
                
            // Get the counts for the predicted structure     
            reset_linear (num_parameters, linear_vector_pred);       
            #if (ENERGY_FUNCTION_DEGREE == LINEAR)
            energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_pred, linear_vector_pred, free_value_pred);
            if (!check_counts_linear (num_parameters, params_array, linear_vector_pred, free_value_pred, energy_fe))
            {
                // message is printed from check_counts_linear
                printf ("%s\n%s\n", sequence, structure_pred);
                exit(1);
            }

            #elif (ENERGY_FUNCTION_DEGREE == QUADRATIC)
            reset_quadratic (num_parameters, quadratic_matrix_pred);
            energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_pred, quadratic_matrix_pred, linear_vector_pred, free_value_pred);
            if (!check_counts_quadratic (num_parameters, params_array, quadratic_matrix_pred, linear_vector_pred, free_value_pred, energy_fe))
            {
                // message is printed from check_counts_linear
                printf ("%s\n%s\n", sequence, structure_pred);
                exit(1);
            }                    
            #else
            printf ("ERROR! Constant ENERGY_FUNCTION_DEGREE should be either LINEAR or QUADRATIC! Abort.\n");
            exit(1);
            #endif

            // also check if energy_fe is the same as the one provided separately
            #if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1)
            alt_energy = FUNCTION_FREE_ENERGY (sequence, structure_pred);
            if (fabs (alt_energy - energy_fe) > 0.1)
            {
                printf ("dG by %s is different from dG by %s: %.2lf vs. %.2lf for \n%s\n%s\n",
                    FUNCTION_FREE_ENERGY, FUNCTION_GET_FEATURE_COUNTS, alt_energy, energy_fe, sequence, structure_pred);
                exit(1);
            }
            #endif               
        
            // print sequence and structures as comments, but they shouldn't be longer than some 500 or so.
            char tmps[90];
            int len = strlen(sequence);
            int index = 0;
            int wl = 80;
            //fprintf (problem, "\n    \\\\ %s", header);
            while (index < len)
            {
                strncpy (tmps, sequence+index, wl);
                index += wl;
            }
            index = 0;
            while (index < len)
            {
                strncpy (tmps, structure_known+index, wl);
                index += wl;
            }
            index = 0;
            while (index < len)
            {
                strncpy (tmps, structure_pred+index, wl);
                index += wl;
            }

//             if (show_comments)
//             {
//                 fprintf (constraints_file, "    \\* %s *\\\n", sequence);
//                 fprintf (constraints_file, "    \\* %s *\\\n", structure_known);
//                 fprintf (constraints_file, "    \\* %s *\\\n", structure_pred);
//                 fprintf (constraints_file, "    \\* ");
//             }
            sprintf (constraint, "    sc%d: ", delta_index);    // structural constraints
            int l=0;
            
            #if (ENERGY_FUNCTION_DEGREE == QUADRATIC)            
            for (i=0; i < num_parameters; i++)
            {
                for (j=i; j < num_parameters; j++)
                {
                    if (quadratic_matrix_known[i][j] - quadratic_matrix_pred[i][j] != 0)
                    {
                        if (l > 0 && quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] > 0)
                        {
                            sprintf (constraint, "%s + ", constraint);
                        }
                        if (quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] == 1)
                        {
                            sprintf (constraint, "%s x%d x%d ", constraint, i, j);
                        }
                        else if (quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] == -1)
                        {
                            sprintf (constraint, "%s - x%d x%d ", constraint, i, j);
                        }
                        else
                        {
                            sprintf (constraint, "%s %g x%d x%d ", constraint, quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j], i, j);
                        }
                        l++;
                        if (l % 8 == 0)
                        {
                            sprintf (constraint, "%s\n        ", constraint);
                        }                        
                    }
                }
            }
            #endif
            
            for (i=0; i < num_parameters; i++)
            {
                if (linear_vector_known[i]-linear_vector_pred[i] != 0)
                {
                    if (l > 0 && linear_vector_known[i]-linear_vector_pred[i] > 0)
                    {
                        sprintf (constraint, "%s + ", constraint);
//                         if (show_comments)
//                             fprintf (constraints_file, " + ");
                    }
                    if (linear_vector_known[i]-linear_vector_pred[i] == 1)
                    {
                        sprintf (constraint, "%s x%d ", constraint, i);
//                         if (show_comments)
//                             fprintf (constraints_file, " %s ", string_params[i]);
                    }
                    else if (linear_vector_known[i]-linear_vector_pred[i] == -1)
                    {
                        sprintf (constraint, "%s - x%d ", constraint, i);
//                         if (show_comments)
//                             fprintf (constraints_file, " - %s ", string_params[i]);
                    }
                    else
                    {
                        sprintf (constraint, "%s %g x%d ", constraint, linear_vector_known[i]-linear_vector_pred[i], i);
//                         if (show_comments)
//                             fprintf (constraints_file, " %.1lf %s ", linear_vector_known[i]-linear_vector_pred[i], string_params[i]);
                    }
                    l++;
                    if (l % 10 == 0)
                    {
                        sprintf (constraint, "%s\n        ", constraint);
//                         if (show_comments)
//                             fprintf (constraints_file, " *\\\n    \\*     ");
                    }
                }
            }

//             if (show_comments)
//                 fprintf (constraints_file, " *\\\n");

            // no idea what the difference here is
            if (replace_instead_of_add)
            {
                sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, free_value_pred-free_value_known);
            }
            else
            {
                sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, free_value_pred-free_value_known);
            }

            fprintf (constraints_file, "%s", constraint);
            delta_index++;

            num_constraints_permol[num_molecules]++;

            fgets (structure_pred, sizeof(buffer), set_file);
        }    // end while structures for this sequence
                
        num_molecules++;
        fgets (header, sizeof(buffer), set_file);    // header "> SSTRAND_ID etc"
    }   // end big while 
    
    fclose (constraints_file);
    fclose (set_file);
        
    // now write the number of constraints per molecule in a file
    if ((num_constraints_permol_file = fopen (num_constraints_permol_filename,"w")) == NULL)
    {
        printf ("ERROR: Cannot open file %s\n", num_constraints_permol_filename);
        return 0;
    }  
    for (i=0; i < num_molecules; i++)
        fprintf (num_constraints_permol_file, "%d\n", num_constraints_permol[i]);
    fclose (num_constraints_permol_file);

    delete [] linear_vector_known;
    delete [] linear_vector_pred;

    #if (ENERGY_FUNCTION_DEGREE == QUADRATIC)
    for (i=0; i < num_parameters; i++)
    {
        delete [] quadratic_matrix_known[i];
        delete [] quadratic_matrix_pred[i];
    }
    delete [] quadratic_matrix_known;
    delete [] quadratic_matrix_pred;
    #endif

    delete [] params_array;
    
    return 0;
}



