Merge pull request #522 from jtengyp/svm
Change the SVM workload's argument handling to use OptionParser
Meng, Peng authored Nov 1, 2017
2 parents 83ac369 + 489654b commit dc77558
Showing 4 changed files with 64 additions and 30 deletions.
3 changes: 3 additions & 0 deletions bin/functions/hibench_prop_env_mapping.py
@@ -96,6 +96,9 @@
# For SVM
NUM_EXAMPLES_SVM="hibench.svm.examples",
NUM_FEATURES_SVM="hibench.svm.features",
NUM_ITERATIONS_SVM="hibench.svm.numIterations",
STEPSIZE_SVM="hibench.svm.stepSize",
REGPARAM_SVM="hibench.svm.regParam",
# For ALS
NUM_USERS_ALS="hibench.als.users",
NUM_PRODUCTS_ALS="hibench.als.products",
2 changes: 1 addition & 1 deletion bin/workloads/ml/svm/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true

SIZE=`dir_size $INPUT_HDFS`
START_TIME=`timestamp`
run_spark_job com.intel.hibench.sparkbench.ml.SVMWithSGDExample --numIterations $NUM_ITERATIONS_SVM --stepSize $STEPSIZE_SVM --regParam $REGPARAM_SVM $INPUT_HDFS
END_TIME=`timestamp`

gen_report ${START_TIME} ${END_TIME} ${SIZE}
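
For reference, here is a sketch of what this call looks like once the new environment variables have been filled in from the workload configuration. The flag values are simply the defaults added to conf/workloads/ml/svm.conf below; the actual substitution is performed by the HiBench launcher and is only assumed here.

# Illustrative expansion only: the values are the new svm.conf defaults,
# and INPUT_HDFS still resolves to ${hibench.hdfs.data.dir}/SVM/Input.
run_spark_job com.intel.hibench.sparkbench.ml.SVMWithSGDExample \
    --numIterations 100 \
    --stepSize 1.0 \
    --regParam 0.01 \
    ${INPUT_HDFS}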
32 changes: 18 additions & 14 deletions conf/workloads/ml/svm.conf
@@ -1,20 +1,24 @@
hibench.svm.tiny.examples 1000
hibench.svm.tiny.features 1000
hibench.svm.small.examples 10000
hibench.svm.small.features 10000
hibench.svm.large.examples 50000
hibench.svm.large.features 100000
hibench.svm.huge.examples 120000
hibench.svm.huge.features 300000
hibench.svm.gigantic.examples 140000
hibench.svm.gigantic.features 300000
hibench.svm.bigdata.examples 150000
hibench.svm.bigdata.features 300000


hibench.svm.examples ${hibench.svm.${hibench.scale.profile}.examples}
hibench.svm.features ${hibench.svm.${hibench.scale.profile}.features}
hibench.svm.partitions ${hibench.default.map.parallelism}

hibench.svm.numIterations 100
hibench.svm.stepSize 1.0
hibench.svm.regParam 0.01

hibench.workload.input ${hibench.hdfs.data.dir}/SVM/Input
hibench.workload.output ${hibench.hdfs.data.dir}/SVM/Output
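
The three new properties reach run.sh through the environment-variable mapping added to bin/functions/hibench_prop_env_mapping.py above. Conceptually the launcher ends up doing something like the following; this is a hand-written sketch of the effect, not the launcher's actual code:

# Hypothetical illustration of the property-to-environment mapping:
export NUM_ITERATIONS_SVM=100    # from hibench.svm.numIterations
export STEPSIZE_SVM=1.0          # from hibench.svm.stepSize
export REGPARAM_SVM=0.01         # from hibench.svm.regParam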
57 changes: 42 additions & 15 deletions sparkbench/ml/src/main/scala/com/intel/hibench/sparkbench/ml/SVMWithSGDExample.scala
@@ -15,40 +15,69 @@
* limitations under the License.
*/

// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
// $example off$

import scopt.OptionParser

object SVMWithSGDExample {

case class Params(
numIterations: Int = 100,
stepSize: Double = 1.0,
regParam: Double = 0.01,
dataPath: String = null
)

def main(args: Array[String]): Unit = {
val defaultParams = Params()

val parser = new OptionParser[Params]("SVM") {
head("SVM: an example of SVM for classification.")
opt[Int]("numIterations")
.text(s"numIterations, default: ${defaultParams.numIterations}")
.action((x,c) => c.copy(numIterations = x))
opt[Double]("stepSize")
.text(s"stepSize, default: ${defaultParams.stepSize}")
.action((x,c) => c.copy(stepSize = x))
opt[Double]("regParam")
.text(s"regParam, default: ${defaultParams.regParam}")
.action((x,c) => c.copy(regParam = x))
arg[String]("<dataPath>")
.required()
.text("data path of SVM")
.action((x, c) => c.copy(dataPath = x))
}
parser.parse(args, defaultParams) match {
case Some(params) => run(params)
case _ => sys.exit(1)
}
}

def run(params: Params): Unit = {

val conf = new SparkConf().setAppName(s"SVM with $params")
val sc = new SparkContext(conf)

val dataPath = params.dataPath
val numIterations = params.numIterations
val stepSize = params.stepSize
val regParam = params.regParam

val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

// Split data into training (60%) and test (40%).
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)

// Run training algorithm to build the model
val model = SVMWithSGD.train(training, numIterations, stepSize, regParam)

// Clear the default threshold.
model.clearThreshold()
@@ -65,8 +65,6 @@ object SVMWithSGDExample {

println("Area under ROC = " + auROC)

sc.stop()
}
}
// scalastyle:on println
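
As a usage sketch that is not part of this commit, the class can also be driven directly with spark-submit. The jar path and HDFS location below are placeholders; in HiBench the class is normally launched through bin/workloads/ml/svm/spark/run.sh. Any option that is omitted falls back to the defaults in Params (100 iterations, step size 1.0, regParam 0.01), and the trailing <dataPath> argument is required.

# Placeholder jar and input path; adjust for the actual deployment.
spark-submit --class com.intel.hibench.sparkbench.ml.SVMWithSGDExample \
    /path/to/sparkbench-assembly.jar \
    --numIterations 200 --stepSize 0.5 --regParam 0.1 \
    hdfs:///HiBench/SVM/Input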
