// Copyright Mirela Andronescu 2005-2008
// Predicts and outputs minimum free energy secondary structures with new parameters;
//  also outputs the accuracy of the new predictions
// Takes as input:
//  - a set of new parameters
//  - a set of sequences with known and initial structures
//  Outputs:
//  - a file with sequences, known and previously predicted structures, and the newly predicted structures
//  - a file with the sensitivity and positive predictive values for the newly predicted structures    

// You need the following functions (see below for details):
//  - a function that takes as input a file with a set of parameters, and fills your internal data structures
//  - a function that predicts the minimum free energy (or low free energy) secondary structure for a given sequence
//  - if your data has secondary structure constraints, you need a variant of the
//          prediction function to accomodate constraints
//  - a function that computes the sensitivity of prediction
//  - a function that computes the positive predictive value of the prediction

///////////////////////////////////////////////////////////////////
// MODIFY THIS PART 

// Include your own header files
#include "simfold.h"
#include "common.h"
#include "params.h"
#include "externs.h"

// Make sure this number for the maximum length of the sequences is large enough
#define MAXLEN      5000    // maximum length of sequences

// Initialize the data, fill the parameter data structures
// You can write anything here
// "\" separates lines in the define directive
#define INITIALIZE \
    char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\
    init_data (argv[0], config_file, RNA, 37.0);
      
// Provide a function that takes as input a file with a set of parameters, and fills your internal data structures
// Example: void fill_data_structures_with_new_parameters (char *parameters_file)
// The parameters_file should have one parameter value per line.
// For example, if you have 3 parameters in your model, with values 1.00, 2.00, and 3.00, the file should be:
// 1.00
// 2.00
// 3.00
// (without the "// ", of course).
#define FUNCTION_READ_PARAMETERS_FROM_FILE  fill_data_structures_with_new_parameters

// If there is a chance there might be structures in the training or testing data sets
//  that DO NOT have structural constraints, then set UNCONSTRAINTED_STRUCTURES to 1
#define UNCONSTRAINED_STRUCTURES  1

// If the above is 1, then you have to provide a function that predicts
//  the minimum free energy (or a low free energy) secondary structure. Otherwise ignore this.
#if (UNCONSTRAINED_STRUCTURES == 1)
    // Prototype: double simfold (char *sequence, char *structure)
    //      - sequence is the given RNA sequence to be folded (input parameter)
    //      - structure is the predicted secondary structure (output parameter)
    //      - the function may return the energy, or return nothing, it doesn't matter
    #define FUNCTION_PREDICTOR simfold
#endif

// If there is a chance there might be structures in the training or testing data sets
//  that DO have structural constraints, then set CONSTRAINTED_STRUCTURES to 1
// Note that the training or testing sets might have both constrained and unconstrained structures
#define CONSTRAINED_STRUCTURES  1

// If CONSTRAINED_STRUCTURES is 1, then you have to provide a variant of the prediction function
//  that takes the restricted string as input
// If CONSTRAINED_STRUCTURES is 0, it doesn't matter what you write here
#if (CONSTRAINED_STRUCTURES == 1)
    // Prototype: double simfold_restricted (char *sequence, char *restricted, char *structure)
    //      - restricted is a given restriction string
    //          (dots and parentheses for the restricted parts, _ (underscore) for unrestricted)
    #define FUNCTION_PREDICTOR_CONSTRAINED simfold_restricted
#endif

// Provide a function that computes the sensitivity of the predicted structure vs. the reference structure
// The definition that I use is: number of correctly predicted base pairs / number of true base pairs
//  Note: if number of true base pairs is 0, I return -1.
// Prototype: double compute_sensitivity (char *reference_structure, char *predicted_structure)
//      - reference_structure is the reference structure, considered ground truth
//      - predicted_structure is some predicted structure
//      - returns sensitivity in [0,1] or -1 if denominator is 0
#define FUNCTION_COMPUTE_SENSITIVITY compute_sensitivity

// Provide a function that computes the positive predictive value (ppv) of the predicted structure
//  vs. the reference structure
// The definition that I use is: number of correctly predicted base pairs / number of predicted base pairs
//  Note: if number of predicted base pairs is 0, I return -1.
// Prototype: double compute_ppv (char *reference_structure, char *predicted_structure)
//      - reference_structure is the reference structure, considered ground truth
//      - predicted_structure is some predicted structure
//      - returns ppv in [0,1] or -1 if denominator is 0
#define FUNCTION_COMPUTE_PPV compute_ppv

// PLEASE LET ME KNOW IF YOU WANT TO USE OTHER ACCURACY FUNCTIONS

// That's it, please let me know if it doesn't compile
// DO NOT MODIFY ANYTHING FROM NOW ON!
///////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>


int main (int argc, char *argv[])
{
    if (argc != 5)
    {
        printf ("Usage: %s <input_new_params_file> <input_old_set_file> <output_new_set_file> <output_accuracy_file> \n", argv[0]);
        exit(0);
    }    

    char sequence[MAXLEN];
    char known_structure[MAXLEN];
    char restricted[MAXLEN];
    char initial_structure[MAXLEN];
    char new_structure[MAXLEN];
    double energy;
    char new_params_filename[200];
    char oldset_filename[200];
    char newset_filename[200];
    FILE *oldset_file, *newset_file;
    int i, j, k;
    char buffer[MAXLEN];
    
    char accuracy_output_filename[500];
    FILE *accuracy_output_file;
    
    strcpy (new_params_filename, argv[1]);
    strcpy (oldset_filename, argv[2]);
    strcpy (newset_filename, argv[3]);
    strcpy (accuracy_output_filename, argv[4]);

    INITIALIZE;

    FUNCTION_READ_PARAMETERS_FROM_FILE (new_params_filename);

    j = 0;
    double sens_initial_train, sens_new_train;
    double ppv_initial_train, ppv_new_train;    

  //////////////////////////////////////////////////////
  // open the old set, to get the sequence, the true structure and the turner structure
    if ((oldset_file = fopen(oldset_filename, "r")) == NULL)
    {
        printf ("Cannot open file %s\n", oldset_filename);
        exit(0);
    }
    
    if ((newset_file = fopen(newset_filename, "w")) == NULL)
    {
        printf ("Cannot open file %s\n", newset_filename);
        exit(0);
    }

    if ((accuracy_output_file = fopen(accuracy_output_filename, "w")) == NULL)
    {
        printf ("Cannot open file %s\n", accuracy_output_filename);
        exit(0);
    }    
    
    //the format is:
    //    > header
    //    sequence
    //    known structure
    //    [restricted]
    //    initial structure
    //    pred structure 1
    //    pred structure 2 etc
    //    empty line    
    
    // read the header line here
    fgets (buffer, sizeof(buffer), oldset_file);    // header "> SSTRAND_ID etc"
    while (!feof(oldset_file))
    {              
        fprintf (newset_file, "%s", buffer);
        fgets (sequence, sizeof(buffer), oldset_file);    // sequence
        fprintf (newset_file, "%s", sequence);
        fgets (known_structure, sizeof(buffer), oldset_file);    // known structure
        fprintf (newset_file, "%s", known_structure);
        // next one is either restricted or initial structure
        fgets (buffer, sizeof(buffer), oldset_file);    // header "> SSTRAND_ID etc"
        if ( strstr (buffer, "_") != NULL)    // it was the restricted line
        {                        
            strcpy (restricted, buffer);
            fprintf (newset_file, "%s", buffer);
            fgets (initial_structure, sizeof(buffer), oldset_file);    // empty line
            fprintf (newset_file, "%s", initial_structure);
        }        
        else    // it was initial_structure
        {
            restricted[0] = '\0';
            strcpy (initial_structure, buffer);
            fprintf (newset_file, "%s", initial_structure);
        }        
        for (i=0; i < strlen (sequence); i++)
        {
            if(sequence[i] == '\n')
            {
                sequence[i] = '\0';
                known_structure[i] = '\0';
                initial_structure[i] = '\0';
                break;
            }
        }
        for (i=0; i < strlen (restricted); i++)
        {
            if(restricted[i] == '\n')  restricted[i] = '\0';
        }                                     
        
        // predict the new structure, write it in the new set file, and measure its accuracy        
        
        //printf ("\nS: %s\nR: %s\n", sequence, restricted);
        if (strcmp (restricted, "") == 0)
        {
            if (UNCONSTRAINED_STRUCTURES == 0)
            {
                printf ("ERROR! constant UNCONSTRAINED_STRUCTURES is 0, but there are UNrestricted structures in your data set!\n");
                exit(1);
            }
            #if (UNCONSTRAINED_STRUCTURES == 1)
            energy = FUNCTION_PREDICTOR (sequence, new_structure);
            #endif
        }
        else
        {
            if (CONSTRAINED_STRUCTURES == 0)
            {
                printf ("ERROR! constant CONSTRAINED_STRUCTURES is 0, but there are restricted structures in your data set!\n");
                exit(1);
            }
            #if (CONSTRAINED_STRUCTURES == 1)
            energy = FUNCTION_PREDICTOR_CONSTRAINED (sequence, restricted, new_structure);
            #endif
        }                
        //printf ("N: %s\n", new_structure);

        // add the new structure into the file only if it is different from known_structure AND from any other structure generated so far.
        int to_add = 1;
        
        // copy the rest of the structures into the new set file
        while (!feof(oldset_file))
        {
            fgets (buffer, sizeof(buffer), oldset_file);    // some other structure
            if (buffer[0] == '\n') break;
            buffer [strlen (sequence)] = '\0';
            fprintf (newset_file, "%s\n", buffer);          
            if (strcmp (new_structure, buffer) == 0)
                to_add = 0;
        }
        if (strcmp (new_structure, known_structure) == 0)
            to_add = 0;
        
        if (to_add)            
            fprintf (newset_file, "%s\n", new_structure);
        fprintf (newset_file, "\n");            
        fflush (newset_file);

        sens_initial_train = FUNCTION_COMPUTE_SENSITIVITY (known_structure, initial_structure);
        ppv_initial_train = FUNCTION_COMPUTE_PPV (known_structure, initial_structure);
        sens_new_train = FUNCTION_COMPUTE_SENSITIVITY (known_structure, new_structure);
        ppv_new_train = FUNCTION_COMPUTE_PPV (known_structure, new_structure);

        fprintf (accuracy_output_file, "Initial set sens: %.3lf, Initial set ppv: %.3lf, New set sens: %.3lf, New set ppv: %.3lf\n", sens_initial_train, ppv_initial_train, sens_new_train, ppv_new_train);
        
        j++;
        
        fgets (buffer, sizeof(buffer), oldset_file);    // header "> SSTRAND_ID etc"
    }
    fclose (oldset_file);
    fclose (newset_file);
    fclose (accuracy_output_file);

    return 0;  
}


