% Classification problem based on Logistic regression. Factors are % transformed by spline approximation technique (Step 1). Reduction of % factors space is based on the importance of factors in logistic regression % (Step 2-3). Cross-Validation technique is used for testing predictive % quality of the model (step 4). clc clear % Load initial data from MAT files load data_in_1.mat % data_in_1 = Matrix of Factors Values load data_in_dep_1.mat % data_in_dep_1 = Vector of Binary Dependent Variable % Set keys for working with different steps of algorithm key_step_1 = 0; % Set key_step_1 = 1 to work with the first step of the problem key_step_2 = 0; % Set key_step_2 = 1 to work with the second step of the problem key_step_3 = 0; % Set key_step_3 = 1 to work with the third step of the problem key_step_4 = 1; % Set key_step_4 = 1 to work with the fourth step of the problem if key_step_1 == 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % STEP 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Prepare splined data %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Input Parameters power=3; % = the degree of spline consclue=3; % = smoothing degree of the spline (< power) numpolinoms=5; % = number of polinoms in the spline % Create problem statement clear toolboxstruc_arr problem_statement problem_statement=sprintf('%s\n',... 'maximize',... ' logexp_sum(spline_sum(matrix_parameters, matrix_one_factor))',... '',... 'Calculate',... 'Point: point_problem_1',... ' logistic(spline_sum_1(matrix_parameters, matrix_one_factor, matrix_one_factor_knots))',... ''); paramdata=[power; numpolinoms; consclue]; %Pack matrix of parameters to PSG Toolbox structure toolboxstruc_arr(1)=tbpsg_matrix_pack('matrix_parameters', paramdata); for num_spline=1:size(data_in_1,2) disp(num_spline) %Pack matrix with one factors data to PSG Toolbox structure toolboxstruc_arr(2) = tbpsg_matrix_pack('matrix_one_factor', data_in_1(:,num_spline),[],data_in_dep_1); % Uncomment the following line to open the problem in Toolbox Window % tbpsg_toolbox(problem_statement,toolboxstruc_arr); % Optimize problem [solution_str,outargstruc_arr] = tbpsg_run(problem_statement, toolboxstruc_arr); % Save solution of the optimization problem to output structure [output_structure] = tbpsg_solution_struct(solution_str, outargstruc_arr); objective_values = tbpsg_objective(solution_str, outargstruc_arr); % Save main results of this step to structure spline_structure(num_spline).name = sprintf('Splined factor number = %g',num_spline); spline_structure(num_spline).splineddata=output_structure.vector_data; spline_structure(num_spline).factor=data_in_1(:,num_spline); spline_structure(num_spline).benchmark=data_in_dep_1; spline_structure(num_spline).objective=objective_values(1); spline_structure(num_spline).knots = output_structure.matrix_data.one_factor_knots; spline_structure(num_spline).quants = output_structure.matrix_data.one_factor_quant; end save spline_structure.mat spline_structure % Uncomment next block to display figure of one spline %{ % n_spline = number of sline to be displayed n_spline = 1; %%%%% Display splines splineYloc=spline_structure(n_spline).splineddata; splineXloc=spline_structure(n_spline).factor; dataforplot1=sortrows([splineXloc,splineYloc],1); splineX=dataforplot1(:,1); splineY=dataforplot1(:,2); nodesx=spline_structure(n_spline).knots; loc1=spline_structure(n_spline).quants; nodesy=[];loc2=0; for j=1:size(loc1,1)-1 loc2=loc2+loc1(j); nodesy=[nodesy,splineY(loc2)]; end h1=figure; title('Splined factor') plot(splineX(1:end),splineY(1:end)) hold on plot(nodesx,nodesy,'d','MarkerSize',10,'Color',[1 0 0]); %filename = sprintf('%s\\Example\\fig_%s', path1, allvars{nfactor}); %saveas(h1, filename, 'jpg'); %close(h1) %} end if key_step_2 == 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % STEP 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Calculation of the importance of splined factors %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% load spline_structure.mat %spline_structure % Input Parameters param_cut = 0.8; % Percent of factors are left on each step of cycle minimal_number = 50; % Minimal number of factors to be left at the end of Step 2 left_size = size(spline_structure,2); matrix_splined_factors=[]; for j=1:size(spline_structure,2) matrix_splined_factors=[matrix_splined_factors,spline_structure(j).splineddata]; end vars_splined_factors={'x1'}; for i=2:size(spline_structure,2) vars_splined_factors = [vars_splined_factors;sprintf('x%g',i)]; end % Create problem statement clear toolboxstruc_arr problem_statement problem_statement=sprintf('%s\n',... 'maximize',... ' logexp_sum(matrix_splined_factors)',... 'Solver: CAR',... '',... 'Calculate',... 'Point: point_problem_1',... ' logistic(matrix_splined_factors)',... ''); count=0; while left_size > minimal_number count=count+1; vars_splined_factors = ['alpha';vars_splined_factors]; %Pack matrix with splined factors left from the previous step of cycle to %PSG Toolbox structure toolboxstruc_arr(1)=tbpsg_matrix_pack('matrix_splined_factors', [ones(size(matrix_splined_factors,1),1),matrix_splined_factors],vars_splined_factors,data_in_dep_1); % Uncomment the following line to open the problem in Toolbox Window %tbpsg_toolbox(problem_statement,toolboxstruc_arr); % Optimize problem [solution_str,outargstruc_arr] = tbpsg_run(problem_statement, toolboxstruc_arr); % Save solution of the optimization problem to output structure [output_structure] = tbpsg_solution_struct(solution_str, outargstruc_arr); point_data_loc = tbpsg_optimal_point_data(solution_str, outargstruc_arr); point_data = point_data_loc.problem_1; point_var_loc = tbpsg_optimal_point_vars(solution_str, outargstruc_arr); point_var = point_var_loc.problem_1; loc = strfind(point_var,'alpha'); pos_loc = []; for i=1:size(loc,2) if ~isempty(loc{i}) pos_loc=[pos_loc,i]; end end if size(pos_loc,2)>1 || size(pos_loc,2)==0 disp('Error with number of alpha') end %Pack point with values of logistic function to PSG Toolbox structure toolboxstruc_arr(2)=tbpsg_point_pack('point_logistic_regression', point_data,point_var); importance_data = functionincrement('logexp_sum', [], [ones(size(matrix_splined_factors,1),1),matrix_splined_factors], [], [], point_data'); importance_data(pos_loc) = []; point_var(pos_loc) = []; sortloc = sortrows([[1:size(importance_data,1)]',importance_data],2); numdel = round(size(point_var,2)*param_cut); vars_left = point_var(sortloc(end-numdel+1:end,1))'; matrix_splined_factors_left = matrix_splined_factors(:,sortloc(end-numdel+1:end,1)); % Save main results of this step to structure factors_left_importance(count).name = 'Left Splined data after using Importance'; factors_left_importance(count).data = matrix_splined_factors; factors_left_importance(count).vars = vars_splined_factors; factors_left_importance(count).importance = importance_data(sortloc(end-numdel+1:end,1)); factors_left_importance(count).objective = output_structure.objective(1); factors_left_importance(count).point = point_data(sortloc(end-numdel+1:end,1)); factors_left_importance(count).size = left_size; factors_left_importance(count).logistic = output_structure.vector_data; matrix_splined_factors = matrix_splined_factors_left; vars_splined_factors = vars_left; left_size = size(matrix_splined_factors,2); clear toolboxstruc_arr end %while left_size > 50 save factors_left_importance.mat factors_left_importance end %key_step_2 == 1 if key_step_3 == 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % STEP 3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Logistic Regression with Cardinality Constraint %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% load factors_left_importance.mat %factors_left_importance % Input Parameters cardlevel=0.0001; % Level of Cardinality function cardconst_table=[20,15,10,8,6,5,4,3,2]; % Set of Constraints on Cardinality function = number of factors to be left on every step of cycle last_factor_str = size(factors_left_importance,2); vars_splined_factors = factors_left_importance(last_factor_str).vars; matrix_splined_factors = factors_left_importance(last_factor_str).data; count=0; loc = strfind(vars_splined_factors,'alpha'); pos_loc = []; for i=1:size(loc,2) if ~isempty(loc{i}) pos_loc=[pos_loc,i]; end end vars_splined_factors(pos_loc) = []; for jk=2:size(cardconst_table,2) count=count+1; cardconst = cardconst_table(jk); % Create problem statement clear toolboxstruc_arr problem_statement problem_statement=sprintf('%s\n',... 'maximize',... ' logexp_sum(matrix_splined_factors)',... ['Constraint: constraint_card, upper_bound = ' mat2str(cardconst) ', linearize=1'],... [' cardn(' mat2str(cardlevel) ', matrix_fcard)'],... 'Solver: CAR',... '',... 'Calculate',... 'Point: point_problem_1',... ' logistic(matrix_splined_factors)',... ''); %Pack matrix with splined factors left from the previous step of cycle to %PSG Toolbox structure toolboxstruc_arr(1)=tbpsg_matrix_pack('matrix_splined_factors',[ones(size(matrix_splined_factors,1),1),matrix_splined_factors],['alpha';vars_splined_factors],data_in_dep_1); %Pack matrix with ones for function cardn to PSG Toolbox structure toolboxstruc_arr(2)=tbpsg_matrix_pack('matrix_fcard',ones(1,size(matrix_splined_factors,2)),vars_splined_factors); % Uncomment the following line to open the problem in Toolbox Window %tbpsg_toolbox(problem_statement,toolboxstruc_arr); % Optimize problem [solution_str,outargstruc_arr] = tbpsg_run(problem_statement, toolboxstruc_arr); % Save solution of the optimization problem to output structure [output_structure] = tbpsg_solution_struct(solution_str, outargstruc_arr); point_data_loc = tbpsg_optimal_point_data(solution_str, outargstruc_arr); point_data = point_data_loc.problem_1; point_var_loc = tbpsg_optimal_point_vars(solution_str, outargstruc_arr); point_var = point_var_loc.problem_1; loc = strfind(point_var,'alpha'); pos_loc = []; for i=1:size(loc,2) if ~isempty(loc{i}) pos_loc=[pos_loc,i]; end end if size(pos_loc,2)>1 || size(pos_loc,2)==0 disp('Error with number of alpha') end point_var(pos_loc) = []; point_data(pos_loc) = []; pos_not_nulls = find((point_data<10^(-5))==0); vars_left = point_var(pos_not_nulls)'; matrix_splined_factors_left = matrix_splined_factors(:,pos_not_nulls); vars_splined_factors = vars_left; matrix_splined_factors = matrix_splined_factors_left; % Save main results of this step to structure factors_left_cardinality(count).name = 'Left Splined data after using Cardinality'; factors_left_cardinality(count).data = matrix_splined_factors; factors_left_cardinality(count).vars = vars_splined_factors; factors_left_cardinality(count).objective = output_structure.objective(1); factors_left_cardinality(count).point = point_data(pos_not_nulls); factors_left_cardinality(count).size = size(point_data(pos_not_nulls),2); factors_left_cardinality(count).logistic = output_structure.vector_data; end %for jk=1:10 save factors_left_cardinality.mat factors_left_cardinality end %if key_step_3 == 1 if key_step_4 == 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % STEP 4 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Cross-Validation for Logistic regression %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% load factors_left_cardinality.mat factors_left_cardinality % Input Parameters CVconst = 4; %Parameter of Cross-Validation = number of divisions of data into in-sample and out-of-sample for count = 1:size(factors_left_cardinality,2) % Create problem statement clear toolboxstruc_arr problem_statement; problem_statement=sprintf('%s\n',... ['for {matrix_fact_in; matrix_fact_out; #n}=CrossValidation(',mat2str(CVconst),', matrix_splined_factors) '],... 'Problem: problem_Logistic_Regression_CV_#n, type = maximize',... 'Objective: objective_max_likelihood',... ' logexp_sum(matrix_fact_in)',... 'Solver: CAR',... '',... 'Calculate',... 'point: point_problem_Logistic_Regression_CV_#n',... ' logexp_sum_1_#n(matrix_fact_out)',... ' logistic_in_#n(matrix_fact_in)',... ' logistic_out_#n(matrix_fact_out)',... 'end for',... ''); matrix_splined_factors = factors_left_cardinality(count).data; vars_splined_factors = factors_left_cardinality(count).vars; %Pack matrix with splined factors left from the previous step of cycle to PSG Toolbox structure toolboxstruc_arr(1)=tbpsg_matrix_pack('matrix_splined_factors',[ones(size(matrix_splined_factors,1),1),matrix_splined_factors],['alpha';vars_splined_factors],data_in_dep_1); % Uncomment the following line to open the problem in Toolbox Window % tbpsg_toolbox(problem_statement,toolboxstruc_arr); % Optimize problem [solution_str,outargstruc_arr] = tbpsg_run(problem_statement, toolboxstruc_arr); % Save solution of the optimization problem to output structure [output_structure] = tbpsg_solution_struct(solution_str, outargstruc_arr); benchmark_loc = data_in_dep_1; for jk=1:CVconst field_out = sprintf('logistic_out_%g',jk); loc = size(output_structure.vector_data.(field_out),1); part_benchmark{jk} = benchmark_loc(1:loc); benchmark_loc(1:loc) = []; end for jk=1:CVconst CV_report_loc(jk).name = 'Cross-Validation for Logistic Regression'; CV_report_loc(jk).objective = output_structure.objective(1+2*(jk-1)); field_in = sprintf('logistic_in_%g',jk); CV_report_loc(jk).logistic_in = output_structure.vector_data.(field_in); %output_structure.vector_data{2*jk}; field_out = sprintf('logistic_out_%g',jk); CV_report_loc(jk).logistic_oos = output_structure.vector_data.(field_out); loc=[1,2,3,4]; loc(find(loc==jk))=[]; CV_report_loc(jk).benchmark_in = [part_benchmark{loc(1)};part_benchmark{loc(2)};part_benchmark{loc(3)}]; CV_report_loc(jk).benchmark_oos = part_benchmark{jk}; end % Calculate AUC for all in-sample and out-of-sample data sets AUC_in=[]; AUC_oos=[]; for jk=1:CVconst AUC_in = [AUC_in,AUCdata(CV_report_loc(jk).benchmark_in, CV_report_loc(jk).logistic_in)]; AUC_oos = [AUC_oos, AUCdata(CV_report_loc(jk).benchmark_oos, CV_report_loc(jk).logistic_oos)]; end % Save main results of this step to structure CV_report(count).full = CV_report_loc; CV_report(count).AUC_in_sample = AUC_in; CV_report(count).AUC_oos_sample = AUC_oos; clear CV_report_loc AUC_in AUC_oos end %for count = 1:size(factors_left_cardinality,2) save CV_report.mat CV_report % Calculate mean AUC for in-sample and out-of-sample data sets AUC_in_composed=[];AUC_oos_composed=[]; for jk=1:size(CV_report,2) AUC_in_composed=[AUC_in_composed,mean(CV_report(jk).AUC_in_sample)]; AUC_oos_composed=[AUC_oos_composed,mean(CV_report(jk).AUC_oos_sample)]; end save report_AUC_CV.mat AUC_in_composed AUC_oos_composed CV_report end %if key_step_4 == 1