Commit 1c6d2fc15b42977f5d65ee252fd62653b0be546c

Authored by Will Hohyon Ryu
1 parent 00ecca08cf
Exists in master and in 1 other branch pikicast

Improved Java interface

Showing 4 changed files with 61 additions and 17 deletions (side-by-side diff)

... ... @@ -21,7 +21,7 @@
21 21 <groupId>com.twitter.penguin</groupId>
22 22 <artifactId>korean-text</artifactId>
23 23 <packaging>jar</packaging>
24   - <version>4.1-SNAPSHOT</version>
  24 + <version>4.0.1-SNAPSHOT</version>
25 25 <name>Korean Text Processing Utilities</name>
26 26 <url>https://github.com/twitter/twitter-korean-text</url>
27 27 <description>Scala library to process Korean text</description>
... ... @@ -39,7 +39,7 @@
39 39 <connection>scm:git:https://github.com/twitter/twitter-korean-text.git</connection>
40 40 <developerConnection>scm:git:git@github.com:twitter/twitter-korean-text.git
41 41 </developerConnection>
42   - <tag>korean-text-4.0</tag>
  42 + <tag>korean-text-4.0.1</tag>
43 43 </scm>
44 44 <prerequisites>
45 45 <maven>3.0.4</maven>
src/main/java/com/twitter/penguin/korean/KoreanTokenJava.java View file @ 1c6d2fc
... ... @@ -32,6 +32,26 @@
32 32 this.unknown = unknown;
33 33 }
34 34  
  35 + public String getText() {
  36 + return text;
  37 + }
  38 +
  39 + public KoreanPosJava getPos() {
  40 + return pos;
  41 + }
  42 +
  43 + public int getOffset() {
  44 + return offset;
  45 + }
  46 +
  47 + public int getLength() {
  48 + return length;
  49 + }
  50 +
  51 + public boolean isUnknown() {
  52 + return unknown;
  53 + }
  54 +
35 55 @Override
36 56 public String toString() {
37 57 String unknownStar = "";
src/main/java/com/twitter/penguin/korean/TwitterKoreanProcessorJava.java View file @ 1c6d2fc
... ... @@ -59,45 +59,59 @@
59 59 }
60 60  
61 61  
  62 +
62 63 /**
63 64 * Transforms the tokenization output to List<KoreanTokenJava>
64 65 *
65 66 * @param tokens Korean tokens (output of tokenize(CharSequence text)).
66 67 * @return List of KoreanTokenJava.
67 68 */
68   - public static List<KoreanTokenJava> tokensToJavaKoreanTokenList(Seq<KoreanToken> tokens) {
  69 + public static List<KoreanTokenJava> tokensToJavaKoreanTokenList(Seq<KoreanToken> tokens, boolean keepSpace) {
69 70 Iterator<KoreanToken> tokenized = tokens.iterator();
70 71 List<KoreanTokenJava> output = Lists.newLinkedList();
71 72 while (tokenized.hasNext()) {
72 73 KoreanToken token = tokenized.next();
73   - output.add(new KoreanTokenJava(
74   - token.text(),
75   - KoreanPosJava.valueOf(token.pos().toString()),
76   - token.offset(),
77   - token.length(),
78   - token.unknown()
79   - ));
  74 + if (keepSpace || token.pos() != KoreanPos.Space()) {
  75 + output.add(new KoreanTokenJava(
  76 + token.text(),
  77 + KoreanPosJava.valueOf(token.pos().toString()),
  78 + token.offset(),
  79 + token.length(),
  80 + token.unknown()
  81 + ));
  82 + }
80 83 }
81 84 return output;
82 85 }
83 86  
  87 + // Overload without keepSpace: passes keepSpace = false, so Space tokens are dropped.
  88 + public static List<KoreanTokenJava> tokensToJavaKoreanTokenList(Seq<KoreanToken> tokens) {
  89 + return tokensToJavaKoreanTokenList(tokens, false);
  90 + }
  91 +
  92 +
84 93 /**
85 94 * Transforms the tokenization output to a List<String> of token texts.
86 95 *
87 96 * @param tokens Korean tokens (output of tokenize(CharSequence text)).
88 97 * @return List of token strings.
89 98 */
90   - public static List<String> tokensToJavaStringList(Seq<KoreanToken> tokens) {
  99 + public static List<String> tokensToJavaStringList(Seq<KoreanToken> tokens, boolean keepSpace) {
91 100 Iterator<KoreanToken> tokenized = tokens.iterator();
92 101 List<String> output = Lists.newLinkedList();
93 102 while (tokenized.hasNext()) {
94 103 final KoreanToken token = tokenized.next();
95 104  
96   - if (token.pos() != KoreanPos.Space()) {
  105 + if (keepSpace || token.pos() != KoreanPos.Space()) {
97 106 output.add(token.text());
98 107 }
99 108 }
100 109 return output;
  110 + }
  111 +
  112 + // Overload without keepSpace: passes keepSpace = false, so Space tokens are dropped.
  113 + public static List<String> tokensToJavaStringList(Seq<KoreanToken> tokens) {
  114 + return tokensToJavaStringList(tokens, false);
101 115 }
102 116  
103 117  
src/test/java/com/twitter/penguin/korean/TwitterKoreanProcessorJavaTest.java View file @ 1c6d2fc
... ... @@ -30,9 +30,8 @@
30 30 Seq<KoreanTokenizer.KoreanToken> stemmed = TwitterKoreanProcessorJava.stem(tokens);
31 31  
32 32 assertEquals(
33   - "[아름답다(Adjective: 0, 4), (Space: 4, 1), 강산(Noun: 5, 2), 을(Josa: 7, 1), " +
34   - " (Space: 8, 1), 귀엽다(Adjective: 9, 4), (Space: 13, 1)," +
35   - " 먹다(Verb: 14, 3), .(Punctuation: 17, 1)]",
  33 + "[아름답다(Adjective: 0, 4), 강산(Noun: 5, 2), 을(Josa: 7, 1), 귀엽다(Adjective: 9, 4), " +
  34 + "먹다(Verb: 14, 3), .(Punctuation: 17, 1)]",
36 35 TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmed).toString());
37 36  
38 37 assertEquals("[아름답다, 강산, 을, 귀엽다, 먹다, .]",
39 38  
... ... @@ -44,8 +43,13 @@
44 43 String text = "착한강아지상을 받은 루루";
45 44 Seq<KoreanTokenizer.KoreanToken> tokens = TwitterKoreanProcessorJava.tokenize(text);
46 45 assertEquals(
  46 + "[착한, 강아지, 상, 을, , 받은, , 루루]",
  47 + TwitterKoreanProcessorJava.tokensToJavaStringList(tokens, true).toString()
  48 + );
  49 +
  50 + assertEquals(
47 51 "[착한, 강아지, 상, 을, 받은, 루루]",
48   - TwitterKoreanProcessorJava.tokensToJavaStringList(tokens).toString()
  52 + TwitterKoreanProcessorJava.tokensToJavaStringList(tokens, false).toString()
49 53 );
50 54 }
51 55  
... ... @@ -56,7 +60,13 @@
56 60 assertEquals(
57 61 "[착한(Adjective: 0, 2), 강아지(Noun: 2, 3), 상(Suffix: 5, 1), 을(Josa: 6, 1), " +
58 62 " (Space: 7, 1), 받은(Verb: 8, 2), (Space: 10, 1), 루루(Noun: 11, 2)]",
59   - TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(tokens).toString()
  63 + TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(tokens, true).toString()
  64 + );
  65 +
  66 + assertEquals(
  67 + "[착한(Adjective: 0, 2), 강아지(Noun: 2, 3), 상(Suffix: 5, 1), 을(Josa: 6, 1), " +
  68 + "받은(Verb: 8, 2), 루루(Noun: 11, 2)]",
  69 + TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(tokens, false).toString()
60 70 );
61 71 }
62 72