Commit 00ecca08cfe6ad2ec4e2d8e287be9a203b3598ee

Authored by Will Hohyon Ryu
1 parent 68edd26e09
Exists in master and in 1 other branch: pikicast

Examples updated for 4.0

Showing 3 changed files with 41 additions and 78 deletions (side-by-side diff)

examples/pom.xml View file @ 00ecca0
... ... @@ -32,7 +32,7 @@
32 32 <dependency>
33 33 <groupId>com.twitter.penguin</groupId>
34 34 <artifactId>korean-text</artifactId>
35   - <version>3.0</version>
  35 + <version>4.0</version>
36 36 </dependency>
37 37 </dependencies>
38 38 <build>
examples/src/main/java/JavaTwitterKoreanTextExample.java View file @ 00ecca0
... ... @@ -18,67 +18,44 @@
18 18  
19 19 import java.util.List;
20 20  
  21 +import scala.collection.Seq;
  22 +
  23 +import com.twitter.penguin.korean.TwitterKoreanProcessor;
21 24 import com.twitter.penguin.korean.TwitterKoreanProcessorJava;
22 25 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
23 26 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer;
24 27  
25 28 public class JavaTwitterKoreanTextExample {
26 29 public static void main(String[] args) {
27   - // Tokenize with normalization + stemmer
28   - TwitterKoreanProcessorJava processor = new TwitterKoreanProcessorJava.Builder().build();
  30 + String text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어";
29 31  
30   - List<String> parsedStrings = processor.tokensToJavaStringList("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
31   - System.out.println(parsedStrings);
32   - // output: [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ]
  32 + // Normalize
  33 + CharSequence normalized = TwitterKoreanProcessorJava.normalize(text);
  34 + System.out.println(normalized);
  35 + // 한국어를 처리하는 예시입니다ㅋㅋ #한국어
33 36  
34   - List<KoreanTokenizer.KoreanToken> parsed = processor
35   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
36   - System.out.println(parsed);
37   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2)]
38 37  
  38 + // Tokenize
  39 + Seq<KoreanTokenizer.KoreanToken> tokens = TwitterKoreanProcessorJava.tokenize(normalized);
  40 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaStringList(tokens));
  41 + // [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ, #한국어]
  42 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(tokens));
  43 + // [한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4)]
39 44  
40   - // Tokenize without stemmer
41   - processor = new TwitterKoreanProcessorJava.Builder()
42   - .disableStemmer()
43   - .build();
44 45  
45   - parsedStrings = processor.tokensToJavaStringList("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
46   - System.out.println(parsedStrings);
47   - // output: [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ]
  46 + // Stemming
  47 + Seq<KoreanTokenizer.KoreanToken> stemmed = TwitterKoreanProcessorJava.stem(tokens);
  48 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaStringList(stemmed));
  49 + // [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ, #한국어]
  50 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmed));
  51 + // [한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4)]
48 52  
49   - parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
50   - System.out.println(parsed);
51   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2)]
52 53  
53   -
54   - // Tokenize with neither normalization nor stemmer
55   - processor = new TwitterKoreanProcessorJava.Builder()
56   - .disableNormalizer()
57   - .disableStemmer()
58   - .build();
59   -
60   - parsedStrings = processor.tokensToJavaStringList("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
61   - System.out.println(parsedStrings);
62   - // output: [한국어, 를, 처리, 하는, 예시, 입니, 닼, ㅋㅋㅋㅋㅋ]
63   -
64   - parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
65   - System.out.println(parsed);
66   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5)]
67   -
68   - List<KoreanPhraseExtractor.KoreanPhrase> phrases = processor
69   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
  54 + // Phrase extraction
  55 + List<KoreanPhraseExtractor.KoreanPhrase> phrases = TwitterKoreanProcessorJava.extractPhrases(tokens, true, true);
70 56 System.out.println(phrases);
71   - // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2)]
  57 + // [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), #한국어(Hashtag: 18, 4)]
72 58  
73   - processor = new TwitterKoreanProcessorJava.Builder()
74   - .disableNormalizer()
75   - .disableStemmer()
76   - .enablePhraseExtractorSpamFilter()
77   - .build();
78   -
79   - phrases = processor.extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
80   - System.out.println(phrases);
81   - // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2)]
82 59 }
83 60 }
examples/src/main/scala/ScalaTwitterKoreanTextExample.scala View file @ 00ecca0
... ... @@ -18,46 +18,32 @@
18 18  
19 19 import com.twitter.penguin.korean.TwitterKoreanProcessor
20 20 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase
21   -import com.twitter.penguin.korean.tokenizer.KoreanTokenizer
  21 +import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken
22 22  
23 23 object ScalaTwitterKoreanTextExample {
24 24 def main(args: Array[String]) {
25   - // Tokenize into List<String>
26   - val parsed: Seq[String] = TwitterKoreanProcessor
27   - .tokensToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
28   - println(parsed)
29   - // List(한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ)
  25 + val text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
30 26  
31   - // Tokenize with Part-of-Speech information
32   - val parsedPos: Seq[KoreanTokenizer.KoreanToken] =
33   - TwitterKoreanProcessor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
34   - println(parsedPos)
35   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2))
  27 + // Normalize
  28 + val normalized: CharSequence = TwitterKoreanProcessor.normalize(text)
  29 + println(normalized)
  30 + // 한국어를 처리하는 예시입니다ㅋㅋ #한국어
36 31  
37   - // Tokenize without stemming
38   - val parsedPosNoStemming: Seq[KoreanTokenizer.KoreanToken] =
39   - TwitterKoreanProcessor
40   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = true, stemming = false)
41   - println(parsedPosNoStemming)
42   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2))
  32 + // Tokenize
  33 + val tokens: Seq[KoreanToken] = TwitterKoreanProcessor.tokenize(normalized)
  34 + println(tokens)
  35 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4))
43 36  
44   - // Tokenize without normalization and stemming
45   - val parsedPosParsingOnly: Seq[KoreanTokenizer.KoreanToken] = TwitterKoreanProcessor
46   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = false, stemming = false)
47   - println(parsedPosParsingOnly)
48   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5))
  37 + // Stemming
  38 + val stemmed: Seq[KoreanToken] = TwitterKoreanProcessor.stem(tokens)
49 39  
  40 + println(stemmed)
  41 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4))
  42 +
50 43 // Phrase extraction
51   - val phrases: Seq[KoreanPhrase] = TwitterKoreanProcessor
52   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발")
  44 + val phrases: Seq[KoreanPhrase] = TwitterKoreanProcessor.extractPhrases(tokens, filterSpam = true, enableHashtags = true)
53 45 println(phrases)
54   - // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2))
55   -
56   - // Phrase extraction with the spam filter enabled
57   - val phrasesSpamFilitered: Seq[KoreanPhrase] = TwitterKoreanProcessor
58   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발", filterSpam = true)
59   - println(phrasesSpamFilitered)
60   - // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2))
  46 + // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), #한국어(Hashtag: 18, 4))
61 47 }
62 48 }