Commit 3f5548ee2360aa9042266fc3528d2f4a64980d7e

Authored by Joanne
1 parent 8230469489
Exists in master

temp change for InsertCidList

Showing 4 changed files with 51 additions and 4 deletions

app/com/piki_ds/preprocess/CidValidation.scala
... ... @@ -2,7 +2,7 @@
2 2  
3 3 import org.apache.spark.sql.SQLContext
4 4  
5   -import com.piki_ds.utils.GetTextFile.getDashDump
  5 +import com.piki_ds.utils.GetTextFile.getDBDump
6 6  
7 7  
8 8 /**
... ... @@ -16,7 +16,7 @@
16 16 val whereStr = s"udate is not null and title is not null and" +
17 17 s" contents_type in ('ALBUM', 'ALBUM.A', 'CHST', 'CHST.A','TOON','TOON.A') and " +
18 18 s"status in (${filterStatus.map(x=>s"'$x'").mkString(",")})"
19   - val mgc = getDashDump(sQLContext,"MG_CONTENTS").where(whereStr)
  19 + val mgc = getDBDump(sQLContext,"MG_CONTENTS").where(whereStr)
20 20 val mgContents = mgc.select(mgc("contents_id"),mgc("status"), unix_timestamp(mgc("udate")))
21 21 mgContents.map(x=>{
22 22 val ts = x.getAs[Long]("unixtimestamp(udate,yyyy-MM-dd HH:mm:ss)")
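Note on the hunk above: the map over mgContents retrieves the timestamp through Spark's auto-generated column name "unixtimestamp(udate,yyyy-MM-dd HH:mm:ss)". A minimal sketch of a more robust variant, assuming the same Spark 1.x DataFrame API used above (the alias name udate_ts is hypothetical):

    import org.apache.spark.sql.functions.unix_timestamp

    // Aliasing the derived column avoids depending on Spark's generated name.
    val mgContents = mgc.select(mgc("contents_id"), mgc("status"),
      unix_timestamp(mgc("udate")).as("udate_ts"))
    mgContents.map(x => x.getAs[Long]("udate_ts"))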
app/com/piki_ds/utils/GetTextFile.scala
... ... @@ -12,7 +12,7 @@
12 12  
13 13 object GetTextFile {
14 14  
15   - def getDashDump(sQLContext: SQLContext, tableName:String) = {
  15 + def getDBDump(sQLContext: SQLContext, tableName:String) = {
16 16 sQLContext.read.parquet(s"hdfs://pikinn/preprocess/db/table=$tableName/")
17 17 }
18 18  
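The rename is mechanical: the helper reads the parquet dump of a DB table from HDFS, and getDBDump describes that better than getDashDump did. A hedged usage sketch, with the SQLContext and table name taken from the CidValidation hunk above:

    import org.apache.spark.sql.SQLContext
    import com.piki_ds.utils.GetTextFile.getDBDump

    val sqlContext = SQLContext.getOrCreate(sc)
    // Resolves to hdfs://pikinn/preprocess/db/table=MG_CONTENTS/
    val mgc = getDBDump(sqlContext, "MG_CONTENTS")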
app/com/piki_ds/ver1/InsertCidList.scala
  1 +package com.piki_ds.ver1
  2 +
  3 +import com.piki_ds.utils.hbase.HbaseInserter
  4 +import org.apache.spark.SparkContext
  5 +import org.apache.spark.sql.SQLContext
  6 +
  7 +/**
  8 + * Created by jungwon on 5/12/16.
  9 + */
  10 +
  11 +object InsertCidList {
  12 +
  13 + var sc: SparkContext = SparkContext.getOrCreate()
  14 + var sqlContext: SQLContext = SQLContext.getOrCreate(sc)
  15 +
  16 + val modelName: Map[String, Seq[String]] = Map(
  17 + "uniform" ->
  18 + Seq("quality"),
  19 + "single"->
  20 + Seq(
  21 + "cf",
  22 + "topic",
  23 + "age",
  24 + "sex",
  25 + "w2v"
  26 + ),
  27 + "ensemble" ->
  28 + Seq(
  29 + "ensemble1",
  30 + "ensemble2",
  31 + "ensemble3",
  32 + "ensemble4"
  33 + )
  34 + )
  35 +
  36 + modelName("single").foreach(model=> {
  37 + val doi = "20160508"
  38 + val getMax30 = sc.objectFile[(Int, String)](s"/user/joanne/clols/$doi/$model")
  39 + getMax30.groupBy(x=>x._1%1000).foreach(x=>{
  40 + val tableName = s"uuid-cidlist_$model"
  41 + val insertArray = x._2.map(a=>(a._1.toString,a._2)).toArray
  42 + val test = new HbaseInserter(tableName)
  43 + test.insert(insertArray)
  44 + })
  45 + })
  46 +
  47 +}
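What the new object does: for each "single" model it loads that day's (uuid, cid-list) pairs, buckets them by uuid % 1000, and writes each bucket as one HbaseInserter batch into uuid-cidlist_<model>. Note the loop sits in the object body, so it runs on first reference to InsertCidList; there is no main method, which matches the "temp change" commit message. A sketch of the batching shape in isolation (the helper name insertBuckets is hypothetical; the HbaseInserter constructor and insert(Array[(String, String)]) calls are taken from the code above):

    import org.apache.spark.rdd.RDD
    import com.piki_ds.utils.hbase.HbaseInserter

    // Bucket rows by uuid % 1000 so each HBase write is a bounded batch;
    // one inserter per bucket, executed on the executors via RDD.foreach.
    def insertBuckets(rows: RDD[(Int, String)], tableName: String): Unit = {
      rows.groupBy { case (uuid, _) => uuid % 1000 }.foreach { case (_, bucket) =>
        val inserter = new HbaseInserter(tableName)
        inserter.insert(bucket.map { case (uuid, cids) => (uuid.toString, cids) }.toArray)
      }
    }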
app/com/piki_ds/ver1/QualityScore.scala
... ... @@ -75,7 +75,7 @@
75 75 val param = Map("content"->0.5D,"comment"-> 0.30D,"editor"->0.20D)
76 76  
77 77 val finalScore: RDD[(Int, Long)] = combineScores(content,comment,editor,param).map(x=>(x._1.toInt,x._2.toLong)).filter(_._1 != 0)
78   -
  78 + scoreSave(doi,"quality","",finalScore.map(x=>(x._1.toString,x._2)),1)
79 79 val insertArray = finalScore.map(x=>(x._1.toString, x._2.toString)).collect()
80 80 val test = new HbaseInserter("cid_quality")
81 81 test.insert(insertArray)
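The only change in this file is persisting the day's quality score through scoreSave before the existing HBase insert. The helper's definition is not part of this diff; inferred from the call site alone (parameter names are guesses), its shape is presumably:

    // Hypothetical signature, reconstructed from
    // scoreSave(doi, "quality", "", finalScore.map(x => (x._1.toString, x._2)), 1)
    def scoreSave(doi: String, scoreName: String, subName: String,
                  scores: RDD[(String, Long)], rep: Int): Unit = ???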