startup
Class ClassificationAction

java.lang.Object
  extended by startup.ClassificationAction
All Implemented Interfaces:
java.lang.Cloneable, java.lang.Runnable

public class ClassificationAction
extends java.lang.Object
implements java.lang.Runnable, java.lang.Cloneable

This class represents the dataflow that is necessary to either build a classification model for a certain functional site, or evaluate a sequence for said functional site. This class is thus part of the backbone of the FunSiP-program, combining all the available resources and classes to produce valid data. Failure of execution in this class will return the workflow-pointer to the SprWorkflow-object, thus ending the objectives set for this classification action. In other words, the execution of ClassificationAction is as atomic as possible.

Author:
Michiel Van Bel

Nested Class Summary
static class ClassificationAction.CLASSIFIERS
          Choice of different available classifiers.
static class ClassificationAction.COMMAND
          The different dataflow/workflow commands.
static class ClassificationAction.FILE_TYPE
          Possible filetypes which can be used when evaluating sequence files.
static class ClassificationAction.STRAND
          Possible strands which should be evaluated by the classification models.
 
Constructor Summary
ClassificationAction(org.apache.log4j.Logger logger, FunSiPWorkflow parent)
          Constructor for this classification action.
 
Method Summary
 void addClassificationFeature(ClassificationFeature cfi)
           
 void addConversion(Conversion c)
           
 Classifier build_model()
          Method which creates a classification model for the current type of supplied training data.
 Classifier chooseClassifier(ClassificationAction.CLASSIFIERS cl)
          This method creates the right classifier implementation that implements the Classifier interface, according to the chosen enumeration.
 void computeOptimalisationStrings()
          This method is a more or less necessary evil.
 void deleteTrainingFile()
          This method attempts to delete the file that contains all the features used for training the classifier.
 void evaluate_model(Classifier classifier)
          Evaluate the content of sequence files with the supplied classifier (and accompanying classification model).
 java.io.File extractAndClassify(java.lang.String sequence, Classifier classifier, java.util.List<java.lang.Integer> sites, java.util.List<java.lang.Integer> output_sites, java.lang.String clas_name, java.lang.String fileName, int increase, int increase_output)
          This method extracts features around functional sites in a sequence, and then uses the provided classifier to classify that particular functional site.
 void extractAndEvaluateData(java.io.File file, Classifier classifier, int index)
          Evaluate the content of a specific file with the supplied classification model.
 java.util.List<java.util.List<java.lang.Double>> extractFeaturesFromSequence(java.lang.String sequence, java.lang.String secon, java.lang.Double energy, java.lang.String[] conversion_strings, int splicesite)
          This method is part of the main-loop that extracts the different features.
 void extractFeaturesFromTrainingData(Classifier classifier)
          This method extracts the necessary features in order to train a classifier with these features.
 java.lang.String getClassification_name()
           
 java.util.List<ClassificationFeature> getClassificationFeatures()
           
 java.lang.String[] getClassifier_options()
           
 ClassificationAction.CLASSIFIERS getClassifier_type()
           
 ClassificationAction.COMMAND getCommand()
           
 java.util.List<Conversion> getConversionList()
           
 java.lang.String[] getConversionStrings(java.lang.String sequence)
          This method takes one string as input and produces all the conversions of this string
 int getCrossvalidation_fold()
           
 java.io.File getCrossValidationTabFile()
           
 java.lang.String getFeatures_output_file()
           
 FeatureSelectionStub getFeatureSelection()
           
 java.io.File getFeaturesTrainingData(java.lang.String fileName, java.util.List<java.lang.String> sequences, SecondaryStructureData ssData, Classifier classifier, Classifier.DATA_TYPE data_type)
          This method extracts the features from a trainingfile (positive/negative), and stores the extracted features into the feature_output_file.
 int getMaximum_crossvalidation()
           
 int getMaximumSecondaryDownRange()
          This method returns the maximum downstreamrange of all ClassificationFeatures that implement secondary structure features.
 int getMaximumSecondaryUpRange()
          This method returns the maximum upstreamrange of all ClassificationFeatures that implement secondary structure features.
 java.lang.String getModel_file()
           
 java.lang.String getName()
           
 int getNegative_training_amount()
           
 java.lang.String getNegative_training_file_secstruct()
           
 java.lang.String getNegative_training_file()
           
 java.lang.Double getOutput_fixed_alpha()
           
 java.lang.Double getOutput_fixed_beta()
           
 java.lang.String getOutputdirectory()
           
 int getPattern_classification_location()
           
 int getPattern_classification_output_f()
           
 int getPattern_classification_output_r()
           
 java.lang.String getPattern()
           
 int getPositive_training_amount()
           
 java.lang.String getPositive_training_file_secstruct()
           
 java.lang.String getPositive_training_file()
           
 java.lang.String[] getPredictionOutputFiles()
           
 ClassificationAction.STRAND getStrand()
           
 ClassificationAction.FILE_TYPE getTestfiles_filetype()
           
 java.lang.String[] getTestfiles()
           
 int getTraining_functional_site_position()
           
 boolean isComplexityCrossvalidation()
           
 boolean isPattern_regexp()
           
 Classifier load_model()
          This method loads a precomputed classification model into memory.
 ClassificationAction make_copy()
          This method makes a copy of the current classification action.
 boolean mustExtractSecondaryStructures()
          This method determines whether or not the class should extract the secondary structures, according to the types of classificationfeatures
 void perform_crossvalidation(Classifier classifier, java.io.File featureFile, java.util.List<RocCurveData> rocCurveDataNames, CrossValidationOutput output)
          This method performs the actual crossvalidation.
 void performOptimalComplexityCrossvalidation(Classifier classifier, java.io.File featureFile)
          This method performs the complexity crossvalidation operation.
 void run()
          Starts the dataflow in an asynchronous, threaded way.
 void setClassification_name(java.lang.String classification_name)
           
 void setClassificationFeatures(java.util.List<ClassificationFeature> newList)
           
 void setClassifier_options(java.lang.String[] classifier_options)
           
 void setClassifier_type(ClassificationAction.CLASSIFIERS classifier_type)
           
 boolean setClassifier_type(java.lang.String s)
           
 void setCommand(ClassificationAction.COMMAND command)
           
 boolean setCommand(java.lang.String s)
           
 void setComplexityCrossvalidation(boolean complexityCrossvalidation)
           
 void setConversionList(java.util.List<Conversion> conversionList)
           
 void setCrossvalidation_fold(int crossvalidation_fold)
           
 void setCrossValidationTabFile(java.io.File crossValidationTabFile)
           
 void setFeatures_output_file(java.lang.String features_output_file)
           
 void setFeatureSelection(FeatureSelectionStub featureSelection)
           
 void setMaximum_crossvalidation(int maximum_crossvalidation)
           
 void setModel_file(java.lang.String file)
           
 void setName(java.lang.String name)
           
 void setNegative_training_amount(int negative_training_amount)
           
 void setNegative_training_file_secstruct(java.lang.String negative_training_file_secstruct)
           
 void setNegative_training_file(java.lang.String negative_training_file)
           
 void setOutput_fixed_alpha(java.lang.Double output_fixed_alpha)
           
 void setOutput_fixed_beta(java.lang.Double output_fixed_beta)
           
 void setOutputdirectory(java.lang.String outputdirectory)
           
 void setPattern_classification_location(int pattern_classification_location)
           
 void setPattern_classification_output_f(int pattern_classification_output_f)
           
 void setPattern_classification_output_r(int pattern_classification_output_r)
           
 void setPattern_regexp(boolean pattern_regexp)
           
 void setPattern(java.lang.String pattern)
           
 void setPositive_training_amount(int positive_training_amount)
           
 void setPositive_training_file_secstruct(java.lang.String positive_training_file_secstruct)
           
 void setPositive_training_file(java.lang.String positive_training_file)
           
 void setPredictionOutputFiles(java.lang.String[] predictionOutputFiles)
           
 void setStrand(ClassificationAction.STRAND strand)
           
 boolean setStrand(java.lang.String s)
           
 void setTestfiles_filetype(ClassificationAction.FILE_TYPE testfiles_filetype)
           
 boolean setTestfiles_filetype(java.lang.String s)
           
 void setTestfiles(java.lang.String[] testfiles)
           
 void setTraining_functional_site_position(int splicesite)
           
 void start_classification()
          This method starts the actual dataflow within the classification action.
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

ClassificationAction

public ClassificationAction(org.apache.log4j.Logger logger,
                            FunSiPWorkflow parent)
Constructor for this classification action.

Parameters:
logger - The logger-object is kept the same throughout the entire workflow, so all logging-activity is done in the same way.
parent - The workflow object that contains this classification action. Each classification action is part of a workflow, and each workflow contains at least one classification action.
Method Detail

make_copy

public ClassificationAction make_copy()
This method makes a copy of the current classification action. This is necessary when multiple copies of this classification action are necessary, for example during optimizations.

Returns:
A copy of the current classification action.

run

public void run()
Starts the dataflow in an asynchronous, threaded way. Useful when - for example - the entire workflow is running in a multi-threaded way. The parent workflow is forced to wait until all classification actions defined within its domain are finished.

Specified by:
run in interface java.lang.Runnable

start_classification

public void start_classification()
This method starts the actual dataflow within the classification action. Depending on the various possible parameters and their values, different types of action will be taken. For example, when the classification action is set to "build model" mode, the normal course of action is to extract training data, apply feature selection and then build the SVM classification model.


build_model

public Classifier build_model()
Method which creates a classification model for the current type of supplied training data. This process is performed in several stages : a) Features are extracted from all the training data b) Features are written to a temporary file c) If necessary, a feature selection algorithm is applied to the features d) If crossvalidation is selected, then the features are used for this crossvalidation. e) If crossvalidation is not selected, supply the features to the classifier in order to have it make the classification model.

Returns:
The classifier containing the classification model.

computeOptimalisationStrings

public void computeOptimalisationStrings()
This method is a more or less necessary evil. It precomputes arrays that contain all the possible strings (of an alphabet) of a certain length. The correct array is then passed on to the method that extracts the features (if necessary). By using this approach, we prevent the recomputation of the same array by each invocation of the extract-features method. Using this hack literally cuts feature extraction time to 1/10th.


perform_crossvalidation

public void perform_crossvalidation(Classifier classifier,
                                    java.io.File featureFile,
                                    java.util.List<RocCurveData> rocCurveDataNames,
                                    CrossValidationOutput output)
This method performs the actual crossvalidation. The method relies on capabilities provided by the classification-object to do the actual calculations.

Parameters:
classifier - The classifier-object used for performing the calculations
featureFile - The file containing the extracted features
rocCurveDataNames - An empty list to be filled with names of roc-curves, derived from the crossvalidation results.
output - Object to which the output of the crossvalidation will be written.

performOptimalComplexityCrossvalidation

public void performOptimalComplexityCrossvalidation(Classifier classifier,
                                                    java.io.File featureFile)
This method performs the complexity crossvalidation operation. This method is used for crossvalidating and optimizing the complexity constant of the support vector machine at the same time. An exponential series of numbers is used as the base for these complexity constants. The series goes from 2^-10 to 2^4 (with 4 being the maximum exponent indicated by the variable set in the configuration file).

Parameters:
classifier - The classifier object.
featureFile - The file containing the extracted features.

load_model

public Classifier load_model()
This method loads a precomputed classification model into memory. This step is necessary to allow the classification of various sequences.

Returns:
The classifier-object that is loaded with the classification model.

evaluate_model

public void evaluate_model(Classifier classifier)
Evaluate the content of sequence files with the supplied classifier (and accompanying classification model). Each file is inspected independently, so if a failure occurs during the evaluation of one of the sequences, then the evaluation can continue for the other ones. Some problems might still arise, though, during the merging of the results.

Parameters:
classifier - The classifier-object containing the classification model.

extractAndEvaluateData

public void extractAndEvaluateData(java.io.File file,
                                   Classifier classifier,
                                   int index)
                            throws java.lang.Exception
Evaluate the content of a specific file with the supplied classification model. The choice is made between forward strand only, reverse strand only or to combine both forward and reverse strand. The results of the classification are hereafter written to the indicated output file if defined.

Parameters:
file - File containing the sequence to be evaluated.
classifier - The classifier-object containing the classification model.
index - The index of the file.
Throws:
java.lang.Exception - Thrown if the evaluation of a certain file goes wrong.

extractAndClassify

public java.io.File extractAndClassify(java.lang.String sequence,
                                       Classifier classifier,
                                       java.util.List<java.lang.Integer> sites,
                                       java.util.List<java.lang.Integer> output_sites,
                                       java.lang.String clas_name,
                                       java.lang.String fileName,
                                       int increase,
                                       int increase_output)
This method extracts features around functional sites in a sequence, and then uses the provided classifier to classify that particular functional site.

Parameters:
sequence - The sequence in which the functional sites are located.
classifier - The classifier with associated classification model that is used for evaluation of the functional sites.
sites - The location of the functional sites in the sequence.
output_sites - The location of the functional sites when remapped to the forward strand (necessary for output).
clas_name - The name used for output for the functional site.
fileName - The name of the file that will contain the results of the evaluation.
increase - An integer that is used to increase/decrease the position of the functional site relative to the location of its identifying pattern (e.g. acceptor AG sites: AG is the pattern, but the acceptor splice site is located after the G).
increase_output - An integer that identifies an additional increase/decrease to the location of the functional site in the output. This is done in order to accommodate the difference in counting (informaticians start from 0, biologists from 1), so the output can be fine-tuned enough that no post-processing is needed.
Returns:
The file containing the evaluations of the various functional sites.

extractFeaturesFromTrainingData

public void extractFeaturesFromTrainingData(Classifier classifier)
                                     throws java.lang.Exception
This method extracts the necessary features in order to train a classifier with these features. It extracts the features for all the sequences in both the positive and negative trainingfiles. It then merges the resulting 2 files into 1 file, according to the specifications of the used classifier.

Parameters:
classifier - The classifier to be used.
Throws:
java.lang.Exception

getFeaturesTrainingData

public java.io.File getFeaturesTrainingData(java.lang.String fileName,
                                            java.util.List<java.lang.String> sequences,
                                            SecondaryStructureData ssData,
                                            Classifier classifier,
                                            Classifier.DATA_TYPE data_type)
This method extracts the features from a trainingfile (positive/negative), and stores the extracted features into the feature_output_file. This file is then returned to the parent method. The method's workflow is pretty easy: iterate over all sequences, extract the features for each sequence and then write the features to a file.

Parameters:
fileName - The name of the file that contains the sequences
sequences - A list with sequences
ssData - The secondary structure data
classifier - The selected classifier
data_type - The datatype (positive,negative,unknown).
Returns:
The file with the features

getConversionStrings

public java.lang.String[] getConversionStrings(java.lang.String sequence)
This method takes one string as input and produces all the conversions of this string

Parameters:
sequence - The original DNA/RNA sequence
Returns:
The array containing all the conversions.

extractFeaturesFromSequence

public java.util.List<java.util.List<java.lang.Double>> extractFeaturesFromSequence(java.lang.String sequence,
                                                                                    java.lang.String secon,
                                                                                    java.lang.Double energy,
                                                                                    java.lang.String[] conversion_strings,
                                                                                    int splicesite)
                                                                             throws FeatureExtractionException
This method is part of the main-loop that extracts the different features. This particular method extracts all the features for a given (pseudo-)splice site, by iterating over the list with classificationfeatures and appending the found features (for each classificationfeature) to the final list. Because this iteration always takes place in the same order, all features are also placed in the same order. Of course, care should be taken by the creators of the classificationfeatures to always produce the same number of features for a given set of parameters. Otherwise the results will be meaningless.

Parameters:
sequence - The sequence from which the features should be extracted.
secon - The secondary structure of the sequence
energy - The free energy that is associated with the secondary structure
splicesite - The location of the splicesite, relative to which the upstream/downstream parameters are located
Returns:
A nested list (list of lists) with features (one list for each classificationfeature)
Throws:
FeatureExtractionException - Thrown when the set parameters are in contradiction with the arguments of the methods.

deleteTrainingFile

public void deleteTrainingFile()
This method attempts to delete the file that contains all the features used for training the classifier. Normally this method should always succeed, but we have noticed that - particularly with LIBSVM - there appears to be a lasting file-lock on the featurefile, which prevents it from being deleted. In order to try to minimize this effect (it seems timing-dependent) we stall the main thread for about 1 second and try to let the java VM release all the locks. Thereafter, one of two different messages is printed according to whether or not the deletion succeeded.


chooseClassifier

public Classifier chooseClassifier(ClassificationAction.CLASSIFIERS cl)
This method creates the right classifier implementation that implements the Classifier interface, according to the chosen enumeration.

Parameters:
cl - Indication of classifier to be used
Returns:
Instantiation of the correct classifier.

mustExtractSecondaryStructures

public boolean mustExtractSecondaryStructures()
This method determines whether or not the class should extract the secondary structures, according to the types of classificationfeatures

Returns:
boolean indicating whether or not to extract the secondary structure features

getMaximumSecondaryUpRange

public int getMaximumSecondaryUpRange()
This method returns the maximum upstreamrange of all ClassificationFeatures that implement secondary structure features. This method is needed to minimize the computational time of inline RNA folding

Returns:
The maximum upstreamrange of all ClassificationFeatures that implement secondary structure features

getMaximumSecondaryDownRange

public int getMaximumSecondaryDownRange()
This method returns the maximum downstreamrange of all ClassificationFeatures that implement secondary structure features. This method is needed to minimize the computational time of inline RNA folding

Returns:
The maximum downstreamrange of all ClassificationFeatures that implement secondary structure features

getPattern

public java.lang.String getPattern()

setPattern

public void setPattern(java.lang.String pattern)

getPattern_classification_location

public int getPattern_classification_location()

setPattern_classification_location

public void setPattern_classification_location(int pattern_classification_location)

isPattern_regexp

public boolean isPattern_regexp()

setPattern_regexp

public void setPattern_regexp(boolean pattern_regexp)

getNegative_training_amount

public int getNegative_training_amount()

setNegative_training_amount

public void setNegative_training_amount(int negative_training_amount)

getPositive_training_amount

public int getPositive_training_amount()

setPositive_training_amount

public void setPositive_training_amount(int positive_training_amount)

getStrand

public ClassificationAction.STRAND getStrand()

setStrand

public void setStrand(ClassificationAction.STRAND strand)

setStrand

public boolean setStrand(java.lang.String s)

getTestfiles_filetype

public ClassificationAction.FILE_TYPE getTestfiles_filetype()

setTestfiles_filetype

public void setTestfiles_filetype(ClassificationAction.FILE_TYPE testfiles_filetype)

setTestfiles_filetype

public boolean setTestfiles_filetype(java.lang.String s)

getCommand

public ClassificationAction.COMMAND getCommand()

setCommand

public void setCommand(ClassificationAction.COMMAND command)

setCommand

public boolean setCommand(java.lang.String s)

getCrossvalidation_fold

public int getCrossvalidation_fold()

setCrossvalidation_fold

public void setCrossvalidation_fold(int crossvalidation_fold)

getMaximum_crossvalidation

public int getMaximum_crossvalidation()

setMaximum_crossvalidation

public void setMaximum_crossvalidation(int maximum_crossvalidation)

getModel_file

public java.lang.String getModel_file()

setModel_file

public void setModel_file(java.lang.String file)

getFeatures_output_file

public java.lang.String getFeatures_output_file()

setFeatures_output_file

public void setFeatures_output_file(java.lang.String features_output_file)

getTestfiles

public java.lang.String[] getTestfiles()

setTestfiles

public void setTestfiles(java.lang.String[] testfiles)

getNegative_training_file

public java.lang.String getNegative_training_file()

setNegative_training_file

public void setNegative_training_file(java.lang.String negative_training_file)

getOutputdirectory

public java.lang.String getOutputdirectory()

setOutputdirectory

public void setOutputdirectory(java.lang.String outputdirectory)

getPositive_training_file

public java.lang.String getPositive_training_file()

setPositive_training_file

public void setPositive_training_file(java.lang.String positive_training_file)

getTraining_functional_site_position

public int getTraining_functional_site_position()

setTraining_functional_site_position

public void setTraining_functional_site_position(int splicesite)

setName

public void setName(java.lang.String name)

getName

public java.lang.String getName()

setClassificationFeatures

public void setClassificationFeatures(java.util.List<ClassificationFeature> newList)

addClassificationFeature

public void addClassificationFeature(ClassificationFeature cfi)

getClassificationFeatures

public java.util.List<ClassificationFeature> getClassificationFeatures()

getClassification_name

public java.lang.String getClassification_name()

setClassification_name

public void setClassification_name(java.lang.String classification_name)

getClassifier_type

public ClassificationAction.CLASSIFIERS getClassifier_type()

setClassifier_type

public void setClassifier_type(ClassificationAction.CLASSIFIERS classifier_type)

setClassifier_type

public boolean setClassifier_type(java.lang.String s)

getNegative_training_file_secstruct

public java.lang.String getNegative_training_file_secstruct()

setNegative_training_file_secstruct

public void setNegative_training_file_secstruct(java.lang.String negative_training_file_secstruct)

getPositive_training_file_secstruct

public java.lang.String getPositive_training_file_secstruct()

setPositive_training_file_secstruct

public void setPositive_training_file_secstruct(java.lang.String positive_training_file_secstruct)

getFeatureSelection

public FeatureSelectionStub getFeatureSelection()

setFeatureSelection

public void setFeatureSelection(FeatureSelectionStub featureSelection)

getClassifier_options

public java.lang.String[] getClassifier_options()

setClassifier_options

public void setClassifier_options(java.lang.String[] classifier_options)

getPredictionOutputFiles

public java.lang.String[] getPredictionOutputFiles()

setPredictionOutputFiles

public void setPredictionOutputFiles(java.lang.String[] predictionOutputFiles)

getPattern_classification_output_f

public int getPattern_classification_output_f()

setPattern_classification_output_f

public void setPattern_classification_output_f(int pattern_classification_output_f)

getPattern_classification_output_r

public int getPattern_classification_output_r()

setPattern_classification_output_r

public void setPattern_classification_output_r(int pattern_classification_output_r)

getOutput_fixed_alpha

public java.lang.Double getOutput_fixed_alpha()

setOutput_fixed_alpha

public void setOutput_fixed_alpha(java.lang.Double output_fixed_alpha)

getOutput_fixed_beta

public java.lang.Double getOutput_fixed_beta()

setOutput_fixed_beta

public void setOutput_fixed_beta(java.lang.Double output_fixed_beta)

isComplexityCrossvalidation

public boolean isComplexityCrossvalidation()

setComplexityCrossvalidation

public void setComplexityCrossvalidation(boolean complexityCrossvalidation)

getCrossValidationTabFile

public java.io.File getCrossValidationTabFile()

setCrossValidationTabFile

public void setCrossValidationTabFile(java.io.File crossValidationTabFile)

getConversionList

public java.util.List<Conversion> getConversionList()

addConversion

public void addConversion(Conversion c)

setConversionList

public void setConversionList(java.util.List<Conversion> conversionList)