I have been working on a proof-of-concept setup for evaluating various PMML-based models. Specifically, I've been using the JPMML evaluator (https://github.com/jpmml/jpmml-evaluator), which is written in Java, calling it from Scala. Among several toy models I have been tinkering with, I created a PMML file for a linear regression model that takes in 10,000 boolean arguments (each one with the value of 1 or 0) and then evaluates the dot product of the 10,000-long vector of weights (defined in the PMML file under the ParamMatrix tag) with the 10,000-long vector of inputs, only one of the entries being 1 and the rest being 0. (Essentially, this amounts to a lookup of the weight by the associated parameter.)
Surprisingly, I discovered that it took about 2 seconds (!) for the PMML evaluator to evaluate such a simple thing, even though the PMML had already been parsed before the evaluate() method was called. Any suggestions as to why?
The PMML file for 10,000 parameters is, obviously, rather long. A similar file with only 10 parameters is attached:
<PMML version="4.2" xmlns="http://www.dmg.org/PMML-4_2">
  <!-- Sample general regression model: one predicted field and ten integer inputs. -->
  <Header description="Generalized Linear Regression Model">
    <Application name="Test Model" version="1.0"/>
  </Header>

  <!-- Field declarations: the target "score" plus inputs "1" through "10" (11 fields). -->
  <DataDictionary numberOfFields="11">
    <DataField name="score" optype="continuous" dataType="double"/>
    <DataField name="1" optype="continuous" dataType="integer"/>
    <DataField name="2" optype="continuous" dataType="integer"/>
    <DataField name="3" optype="continuous" dataType="integer"/>
    <DataField name="4" optype="continuous" dataType="integer"/>
    <DataField name="5" optype="continuous" dataType="integer"/>
    <DataField name="6" optype="continuous" dataType="integer"/>
    <DataField name="7" optype="continuous" dataType="integer"/>
    <DataField name="8" optype="continuous" dataType="integer"/>
    <DataField name="9" optype="continuous" dataType="integer"/>
    <DataField name="10" optype="continuous" dataType="integer"/>
  </DataDictionary>

  <!-- NOTE(review): linkFunction is "logit" while distribution is "normal";
       confirm this is intended if the model should return the raw linear predictor. -->
  <GeneralRegressionModel modelName="Test Model"
                          modelType="generalizedLinear"
                          functionName="regression"
                          algorithmName="glm"
                          distribution="normal"
                          linkFunction="logit">

    <!-- "score" is the predicted field; fields "1" through "10" are active inputs. -->
    <MiningSchema>
      <MiningField name="score" usageType="predicted"/>
      <MiningField name="1" usageType="active"/>
      <MiningField name="2" usageType="active"/>
      <MiningField name="3" usageType="active"/>
      <MiningField name="4" usageType="active"/>
      <MiningField name="5" usageType="active"/>
      <MiningField name="6" usageType="active"/>
      <MiningField name="7" usageType="active"/>
      <MiningField name="8" usageType="active"/>
      <MiningField name="9" usageType="active"/>
      <MiningField name="10" usageType="active"/>
    </MiningSchema>

    <!-- The predicted value is reported under the output name "score". -->
    <Output>
      <OutputField name="score" feature="predictedValue"/>
    </Output>

    <!-- Model parameters: the intercept ("score_param") and one parameter per input. -->
    <ParameterList>
      <Parameter name="score_param" label="(Intercept)"/>
      <Parameter name="1" label="1"/>
      <Parameter name="2" label="2"/>
      <Parameter name="3" label="3"/>
      <Parameter name="4" label="4"/>
      <Parameter name="5" label="5"/>
      <Parameter name="6" label="6"/>
      <Parameter name="7" label="7"/>
      <Parameter name="8" label="8"/>
      <Parameter name="9" label="9"/>
      <Parameter name="10" label="10"/>
    </ParameterList>

    <!-- Every input field is listed as a covariate. -->
    <CovariateList>
      <Predictor name="1"/>
      <Predictor name="2"/>
      <Predictor name="3"/>
      <Predictor name="4"/>
      <Predictor name="5"/>
      <Predictor name="6"/>
      <Predictor name="7"/>
      <Predictor name="8"/>
      <Predictor name="9"/>
      <Predictor name="10"/>
    </CovariateList>

    <!-- Links each covariate to the parameter of the same name with value "1".
         The intercept parameter has no cell here. -->
    <PPMatrix>
      <PPCell value="1" predictorName="1" parameterName="1"/>
      <PPCell value="1" predictorName="2" parameterName="2"/>
      <PPCell value="1" predictorName="3" parameterName="3"/>
      <PPCell value="1" predictorName="4" parameterName="4"/>
      <PPCell value="1" predictorName="5" parameterName="5"/>
      <PPCell value="1" predictorName="6" parameterName="6"/>
      <PPCell value="1" predictorName="7" parameterName="7"/>
      <PPCell value="1" predictorName="8" parameterName="8"/>
      <PPCell value="1" predictorName="9" parameterName="9"/>
      <PPCell value="1" predictorName="10" parameterName="10"/>
    </PPMatrix>

    <!-- Coefficient (beta) per parameter; the intercept's beta is 0. -->
    <ParamMatrix>
      <PCell parameterName="score_param" df="1" beta="0"/>
      <PCell parameterName="1" df="1" beta="0.9011255655311873"/>
      <PCell parameterName="2" df="1" beta="0.9213215091770879"/>
      <PCell parameterName="3" df="1" beta="0.3623167363776304"/>
      <PCell parameterName="4" df="1" beta="0.2611235035721956"/>
      <PCell parameterName="5" df="1" beta="0.22401829251069683"/>
      <PCell parameterName="6" df="1" beta="0.47040738391130854"/>
      <PCell parameterName="7" df="1" beta="0.37109087550595976"/>
      <PCell parameterName="8" df="1" beta="0.6147636550988892"/>
      <PCell parameterName="9" df="1" beta="0.2719585907349067"/>
      <PCell parameterName="10" df="1" beta="0.1219419730278416"/>
    </ParamMatrix>
  </GeneralRegressionModel>
</PMML>