// Copyright Mirela Andronescu 2005-2008
// Predicts and outputs minimum free energy secondary structures with initial parameters;
// Takes as input:
//  - a file with the set of initial parameters
//  - a file with the set of sequences with known structures
//  Outputs:
//  - a file with sequences, known structures, and the predicted structures with the initial set of parameters

// You need the following functions (see below for details):
//  - a function that takes as input a file with a set of parameters, and fills your internal data structures
//  - a function that predicts the minimum free energy (or low free energy) secondary structure for a given sequence
//  - if your data has secondary structure constraints, you need a variant of the
//          prediction function to accomodate constraints

///////////////////////////////////////////////////////////////////
// MODIFY THIS PART 

// Include your own header files
#include "simfold.h"
#include "common.h"
#include "params.h"
#include "externs.h"

// Make sure this number for the maximum length of the sequences is large enough
#define MAXLEN      5000    // maximum length of sequences

// Initialize the data, fill the parameter data structures
// You can write anything here
// "\" separates lines in the define directive
#define INITIALIZE \
    char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\
    init_data (argv[0], config_file, RNA, 37.0);
      
// Provide a function that takes as input a file with a set of parameters, and fills your internal data structures
// Example: void fill_data_structures_with_new_parameters (char *parameters_file)
// The parameters_file should have one parameter value per line.
// For example, if you have 3 parameters in your model, with values 1.00, 2.00, and 3.00, the file should be:
// 1.00
// 2.00
// 3.00
// (without the "// ", of course).
#define FUNCTION_READ_PARAMETERS_FROM_FILE  fill_data_structures_with_new_parameters

// If there is a chance there might be structures in the training or testing data sets
//  that DO NOT have structural constraints, then set UNCONSTRAINTED_STRUCTURES to 1
#define UNCONSTRAINED_STRUCTURES  1

// If the above is 1, then you have to provide a function that predicts
//  the minimum free energy (or a low free energy) secondary structure. Otherwise ignore this.
#if (UNCONSTRAINED_STRUCTURES == 1)
    // Prototype: double simfold (char *sequence, char *structure)
    //      - sequence is the given RNA sequence to be folded (input parameter)
    //      - structure is the predicted secondary structure (output parameter)
    //      - the function may return the energy, or return nothing, it doesn't matter
    #define FUNCTION_PREDICTOR simfold
#endif

// If there is a chance there might be structures in the training or testing data sets
//  that DO have structural constraints, then set CONSTRAINTED_STRUCTURES to 1
// Note that the training or testing sets might have both constrained and unconstrained structures
#define CONSTRAINED_STRUCTURES  1

// If CONSTRAINED_STRUCTURES is 1, then you have to provide a variant of the prediction function
//  that takes the restricted string as input
// If CONSTRAINED_STRUCTURES is 0, it doesn't matter what you write here
#if (CONSTRAINED_STRUCTURES == 1)
    // Prototype: double simfold_restricted (char *sequence, char *restricted, char *structure)
    //      - restricted is a given restriction string
    //          (dots and parentheses for the restricted parts, _ (underscore) for unrestricted)
    #define FUNCTION_PREDICTOR_CONSTRAINED simfold_restricted
#endif

// That's it, please let me know if it doesn't compile
// DO NOT MODIFY ANYTHING FROM NOW ON!
///////////////////////////////////////////////////////////////////



#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

 
int main (int argc, char *argv[])
{
    char sequence[MAXLEN];
    char new_structure[MAXLEN];
    char known_structure[MAXLEN];
    char restricted[MAXLEN];
    double energy;
    char params_filename[500];    
    char oldset_filename[200];
    char newset_filename[200];
    FILE *oldset_file, *newset_file;
    int i, j;
    char buffer[MAXLEN];

    if (argc != 4)
    {
        printf ("Usage: %s <input_initial_params_file> <input_old_set_file> <output_new_set_file>\n", argv[0]);
        exit(0);
    }
    strcpy (params_filename, argv[1]);
    strcpy (oldset_filename, argv[2]);
    strcpy (newset_filename, argv[3]);

    INITIALIZE;

    FUNCTION_READ_PARAMETERS_FROM_FILE (params_filename);
            
    //////////////////////////////////////////////////////
    // open the old set, to get the sequence, the true structure 
    if ((oldset_file = fopen(oldset_filename, "r")) == NULL)
    {
        printf ("Cannot open file %s\n", oldset_filename);
        exit(0);
    }
    
    if ((newset_file = fopen(newset_filename, "w")) == NULL)
    {
        printf ("Cannot open file %s\n", newset_filename);
        exit(0);
    }
    
    //the format is:
    //    > header
    //    sequence
    //    known structure
    //    [restricted]
    //    empty line        
    
    fgets (buffer, sizeof(buffer), oldset_file);    // header "> SSTRAND_ID etc"    
    while (!feof(oldset_file))
    {      
        fprintf (newset_file, "%s", buffer);
        fgets (sequence, sizeof(buffer), oldset_file);
        fprintf (newset_file, "%s", sequence);
        fgets (known_structure, sizeof(buffer), oldset_file);    // real structure
        fprintf (newset_file, "%s", known_structure);
        
        fgets (buffer, sizeof(buffer), oldset_file);   
        if ( strstr (buffer, "_") != NULL)    // it was the restricted line
        {                        
            strcpy (restricted, buffer);
            fprintf (newset_file, "%s", buffer);
            fgets (buffer, sizeof(buffer), oldset_file);    // empty line
        }        
        else    // it was new_structure
        {
            restricted[0] = '\0';
        }        
        
              
        for (i=0; i < strlen (sequence); i++)
        {
            if(sequence[i] == '\n')
            {
                sequence[i] = '\0';
                known_structure[i] = '\0';
                break;
            }
        }
        for (i=0; i < strlen (restricted); i++)
        {
            if(restricted[i] == '\n')
            {
                restricted[i] = '\0';
            }
        }
        fflush (newset_file);
              

        if (strcmp (restricted, "") == 0)
        {
            if (UNCONSTRAINED_STRUCTURES == 0)
            {
                printf ("ERROR! constant UNCONSTRAINED_STRUCTURES is 0, but there are UNrestricted structures in your data set!\n");
                exit(1);
            }
            #if (UNCONSTRAINED_STRUCTURES == 1)
            energy = FUNCTION_PREDICTOR (sequence, new_structure);
            #endif
        }
        else
        {
            if (CONSTRAINED_STRUCTURES == 0)
            {
                printf ("ERROR! constant CONSTRAINED_STRUCTURES is 0, but there are restricted structures in your data set!\n");
                exit(1);
            }
            #if (CONSTRAINED_STRUCTURES == 1)
            energy = FUNCTION_PREDICTOR_CONSTRAINED (sequence, restricted, new_structure);
            #endif
        }                
              
        fprintf (newset_file, "%s\n\n", new_structure);
        fflush (newset_file);

        fgets (buffer, sizeof(buffer), oldset_file);    // header "> SSTRAND_ID etc"
    }
    fclose (oldset_file);
    fclose (newset_file);    
}


