Commit fcac93ed76c9c55fce2628e4bc3bc1bb0f32e7e4

Authored by Will Hohyon Ryu ago
Exists in master and in 1 other branch pikicast

Merge pull request #61 from twitter/60_KoreanPos

60 korean pos

Showing 7 changed files Side-by-side Diff

examples/pom.xml View file @ fcac93e
... ... @@ -32,7 +32,7 @@
32 32 <dependency>
33 33 <groupId>com.twitter.penguin</groupId>
34 34 <artifactId>korean-text</artifactId>
35   - <version>2.4.2</version>
  35 + <version>3.0</version>
36 36 </dependency>
37 37 </dependencies>
38 38 <build>
examples/src/main/java/JavaTwitterKoreanTextExample.java View file @ fcac93e
... ... @@ -19,6 +19,7 @@
19 19 import java.util.List;
20 20  
21 21 import com.twitter.penguin.korean.TwitterKoreanProcessorJava;
  22 +import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
22 23 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer;
23 24  
24 25 public class JavaTwitterKoreanTextExample {
... ... @@ -33,7 +34,7 @@
33 34 List<KoreanTokenizer.KoreanToken> parsed = processor
34 35 .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
35 36 System.out.println(parsed);
36   - // output: [한국어Noun, 를Josa, 처리Noun, 하다Verb, 예시Noun, 이다Adjective, ㅋㅋKoreanParticle]
  37 + // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2)]
37 38  
38 39  
39 40 // Tokenize without stemmer
40 41  
... ... @@ -43,11 +44,11 @@
43 44  
44 45 parsedStrings = processor.tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
45 46 System.out.println(parsedStrings);
46   - // output: [한국어, 를, 처리, 하는, 예시, 입, 니다, ㅋㅋ]
  47 + // output: [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ]
47 48  
48 49 parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
49 50 System.out.println(parsed);
50   - // output: [한국어Noun, 를Josa, 처리Noun, 하는Verb, 예시Noun, 입Adjective, 니다Eomi, ㅋㅋKoreanParticle]
  51 + // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2)]
51 52  
52 53  
53 54 // Tokenize with neither normalization nor stemmer
54 55  
55 56  
56 57  
... ... @@ -58,16 +59,16 @@
58 59  
59 60 parsedStrings = processor.tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
60 61 System.out.println(parsedStrings);
61   - // output: [한국어, 를, 처리, 하는, 예시, 입, 니, 닼, ㅋㅋㅋㅋㅋ]
  62 + // output: [한국어, 를, 처리, 하는, 예시, 입니, 닼, ㅋㅋㅋㅋㅋ]
62 63  
63 64 parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
64 65 System.out.println(parsed);
65   - // output: [한국어Noun, 를Josa, 처리Noun, 하는Verb, 예시Noun, 입Noun, 니Josa, 닼Noun*, ㅋㅋㅋㅋㅋKoreanParticle]
  66 + // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5)]
66 67  
67   - List<CharSequence> phrases = processor
  68 + List<KoreanPhraseExtractor.KoreanPhrase> phrases = processor
68 69 .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
69 70 System.out.println(phrases);
70   - // output: [한국어, 처리, 처리하는 예시, 예시, 시발]
  71 + // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2)]
71 72  
72 73 processor = new TwitterKoreanProcessorJava.Builder()
73 74 .disableNormalizer()
... ... @@ -77,7 +78,7 @@
77 78  
78 79 phrases = processor.extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
79 80 System.out.println(phrases);
80   - // output: [한국어, 처리, 처리하는 예시, 예시]
  81 + // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2)]
81 82 }
82 83 }
examples/src/main/scala/ScalaTwitterKoreanTextExample.scala View file @ fcac93e
... ... @@ -17,6 +17,7 @@
17 17 */
18 18  
19 19 import com.twitter.penguin.korean.TwitterKoreanProcessor
  20 +import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase
20 21 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer
21 22  
22 23 object ScalaTwitterKoreanTextExample {
23 24  
24 25  
25 26  
26 27  
27 28  
28 29  
29 30  
... ... @@ -25,38 +26,38 @@
25 26 val parsed: Seq[String] = TwitterKoreanProcessor
26 27 .tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
27 28 println(parsed)
28   - // ArraySeq(한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ)
  29 + // List(한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ)
29 30  
30 31 // Tokenize with Part-of-Speech information
31 32 val parsedPos: Seq[KoreanTokenizer.KoreanToken] =
32 33 TwitterKoreanProcessor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
33 34 println(parsedPos)
34   - // ArraySeq(한국어Noun, 를Josa, 처리Noun, 하다Verb, 예시Noun, 이다Adjective, ㅋㅋKoreanParticle)
  35 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2))
35 36  
36 37 // Tokenize without stemming
37 38 val parsedPosNoStemming: Seq[KoreanTokenizer.KoreanToken] =
38 39 TwitterKoreanProcessor
39 40 .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = true, stemming = false)
40 41 println(parsedPosNoStemming)
41   - // ArraySeq(한국어Noun, 를Josa, 처리Noun, 하는Verb, 예시Noun, 입Adjective, 니다Eomi, ㅋㅋKoreanParticle)
  42 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2))
42 43  
43 44 // Tokenize without normalization and stemming
44 45 val parsedPosParsingOnly: Seq[KoreanTokenizer.KoreanToken] = TwitterKoreanProcessor
45 46 .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = false, stemming = false)
46 47 println(parsedPosParsingOnly)
47   - // ArraySeq(한국어Noun, 를Josa, 처리Noun, 하는Verb, 예시Noun, 입Noun, 니Josa, 닼Noun*, ㅋㅋㅋㅋㅋKoreanParticle)
  48 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5))
48 49  
49 50 // Phrase extraction
50   - val phrases: Seq[CharSequence] = TwitterKoreanProcessor
  51 + val phrases: Seq[KoreanPhrase] = TwitterKoreanProcessor
51 52 .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발")
52 53 println(phrases)
53   - // List(한국어, 처리, 처리하는 예시, 예시, 시발)
  54 + // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2))
54 55  
55 56 // Phrase extraction with the spam filter enabled
56   - val phrasesSpamFilitered: Seq[CharSequence] = TwitterKoreanProcessor
  57 + val phrasesSpamFilitered: Seq[KoreanPhrase] = TwitterKoreanProcessor
57 58 .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발", filterSpam = true)
58 59 println(phrasesSpamFilitered)
59   - // List(한국어, 처리, 처리하는 예시, 예시)
  60 + // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2))
60 61 }
61 62 }
... ... @@ -21,7 +21,7 @@
21 21 <groupId>com.twitter.penguin</groupId>
22 22 <artifactId>korean-text</artifactId>
23 23 <packaging>jar</packaging>
24   - <version>3.0-SNAPSHOT</version>
  24 + <version>3.1-SNAPSHOT</version>
25 25 <name>Korean Text Processing Utilities</name>
26 26 <url>https://github.com/twitter/twitter-korean-text</url>
27 27 <description>Scala library to process Korean text</description>
... ... @@ -39,7 +39,7 @@
39 39 <connection>scm:git:https://github.com/twitter/twitter-korean-text.git</connection>
40 40 <developerConnection>scm:git:git@github.com:twitter/twitter-korean-text.git
41 41 </developerConnection>
42   - <tag>korean-text-3.0</tag>
  42 + <tag>korean-text-3.1</tag>
43 43 </scm>
44 44 <prerequisites>
45 45 <maven>3.0.4</maven>
... ... @@ -103,6 +103,11 @@
103 103 <artifactId>junit</artifactId>
104 104 <version>4.11</version>
105 105 <scope>test</scope>
  106 + </dependency>
  107 + <dependency>
  108 + <groupId>com.google.guava</groupId>
  109 + <artifactId>guava</artifactId>
  110 + <version>16.0.1</version>
106 111 </dependency>
107 112 </dependencies>
108 113 <reporting>
src/main/java/com/twitter/penguin/korean/KoreanPosJava.java View file @ fcac93e
  1 +/*
  2 + * Twitter Korean Text - Scala library to process Korean text
  3 + *
  4 + * Copyright 2015 Twitter, Inc.
  5 + *
  6 + * Licensed under the Apache License, Version 2.0 (the "License");
  7 + * you may not use this file except in compliance with the License.
  8 + * You may obtain a copy of the License at
  9 + *
  10 + * http://www.apache.org/licenses/LICENSE-2.0
  11 + *
  12 + * Unless required by applicable law or agreed to in writing, software
  13 + * distributed under the License is distributed on an "AS IS" BASIS,
  14 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 + * See the License for the specific language governing permissions and
  16 + * limitations under the License.
  17 + */
  18 +package com.twitter.penguin.korean;
  19 +
  20 +/**
  21 + * This enum intentionally does not follow Java's capitalization convention for enum constants:
  22 + * names match Scala's com.twitter.penguin.korean.util.KoreanPos enumeration, and TwitterKoreanProcessorJava.tokenize looks them up by name via valueOf.
  23 + */
  24 +public enum KoreanPosJava {
  25 + // Word level POS
  26 + Noun, Verb, Adjective,
  27 + Adverb, Determiner, Exclamation,
  28 + Josa, Eomi, PreEomi, Conjunction,
  29 + NounPrefix, VerbPrefix, Suffix, Unknown,
  30 +
  31 + // Chunk level POS
  32 + Korean, Foreign, Number, KoreanParticle, Alpha,
  33 + Punctuation, Hashtag, ScreenName,
  34 + Email, URL, CashTag,
  35 +
  36 + // Functional POS
  37 + Space, Others;
  38 +}
src/main/java/com/twitter/penguin/korean/KoreanTokenJava.java View file @ fcac93e
  1 +/*
  2 + * Twitter Korean Text - Scala library to process Korean text
  3 + *
  4 + * Copyright 2015 Twitter, Inc.
  5 + *
  6 + * Licensed under the Apache License, Version 2.0 (the "License");
  7 + * you may not use this file except in compliance with the License.
  8 + * You may obtain a copy of the License at
  9 + *
  10 + * http://www.apache.org/licenses/LICENSE-2.0
  11 + *
  12 + * Unless required by applicable law or agreed to in writing, software
  13 + * distributed under the License is distributed on an "AS IS" BASIS,
  14 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 + * See the License for the specific language governing permissions and
  16 + * limitations under the License.
  17 + */
  18 +package com.twitter.penguin.korean;
  19 +
  20 +public class KoreanTokenJava { // Java-side token value: text, POS tag, offset, length and an "unknown" flag.
  21 + String text; // NOTE(review): all five fields are package-private and non-final, and this class declares no getters.
  22 + KoreanPosJava pos;
  23 + int offset; // presumably the start index of the token in the input text - confirm against KoreanToken.offset()
  24 + int length;
  25 + boolean unknown; // when true, toString() appends "*" right after the text
  26 +
  27 + public KoreanTokenJava(String text, KoreanPosJava pos, int offset, int length, boolean unknown) { // stores the arguments as-is; no validation
  28 + this.text = text;
  29 + this.pos = pos;
  30 + this.offset = offset;
  31 + this.length = length;
  32 + this.unknown = unknown;
  33 + }
  34 +
  35 + @Override
  36 + public String toString() { // renders as text[*](pos: offset, length), e.g. "닼*(Noun: 14, 1)" for an unknown token
  37 + String unknownStar = "";
  38 + if (unknown){
  39 + unknownStar = "*";
  40 + }
  41 + return String.format("%s%s(%s: %d, %d)", text, unknownStar, pos.toString(), offset, length); // NOTE(review): pos.toString() throws NullPointerException if pos is null
  42 + }
  43 +}
src/main/java/com/twitter/penguin/korean/TwitterKoreanProcessorJava.java View file @ fcac93e
... ... @@ -20,11 +20,13 @@
20 20  
21 21 import java.util.List;
22 22  
  23 +import scala.collection.Iterator;
23 24 import scala.collection.JavaConversions;
24 25 import scala.collection.Seq;
25 26  
  27 +import com.google.common.collect.Lists;
  28 +
26 29 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
27   -import com.twitter.penguin.korean.stemmer.KoreanStemmer;
28 30 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken;
29 31  
30 32 /**
31 33  
... ... @@ -77,11 +79,24 @@
77 79 * @param text Input text.
78 80 * @return A list of Korean Tokens
79 81 */
80   - public List<KoreanToken> tokenize(CharSequence text) {
81   - Seq<KoreanToken> tokenized = TwitterKoreanProcessor.tokenize(
  82 + public List<KoreanTokenJava> tokenize(CharSequence text) { // now returns KoreanTokenJava copies instead of the Scala KoreanToken objects
  83 + Iterator<KoreanToken> tokenized = TwitterKoreanProcessor.tokenize(
82 84 text, normalizerEnabled, stemmerEnabled, keepSpaceEnabled
83   - );
84   - return JavaConversions.seqAsJavaList(tokenized);
  85 + ).iterator(); // Scala iterator over the tokenizer result
  86 +
  87 + List<KoreanTokenJava> output = Lists.newLinkedList(); // NOTE(review): Guava linked list; it is only appended to and returned here
  88 + while (tokenized.hasNext()) {
  89 + KoreanToken token = tokenized.next();
  90 + output.add(new KoreanTokenJava( // copy each Scala token field-by-field into the Java value class
  91 + token.text(),
  92 + KoreanPosJava.valueOf(token.pos().toString()), // by-name lookup; valueOf throws IllegalArgumentException if KoreanPosJava has no constant with that name
  93 + token.offset(),
  94 + token.length(),
  95 + token.unknown()
  96 + ));
  97 +
  98 + }
  99 + return output;
85 100 }
86 101  
87 102 /**