Commit 07d38baf2a22c8fea57b9c970bebf3da1abbccd7

Authored by Will Hohyon Ryu ago
1 parent 00ecca08cf
Exists in master and in 1 other branch pikicast

Update README.md

Showing 1 changed file with 48 additions and 84 deletions Side-by-side Diff

... ... @@ -45,23 +45,24 @@
45 45 <dependency>
46 46 <groupId>com.twitter.penguin</groupId>
47 47 <artifactId>korean-text</artifactId>
48   - <version>3.0</version>
  48 + <version>4.0</version>
49 49 </dependency>
50 50 ```
51 51  
52 52 The Maven site is available at http://twitter.github.io/twitter-korean-text/ and the Scaladocs are at http://twitter.github.io/twitter-korean-text/scaladocs/
53 53  
54   -## .net Wrapper
  54 +## Wrappers
  55 +### .net Wrapper
55 56  
56 57 [modamoda](https://github.com/modamoda) created a repository and built a simple C# wrapper project.
57 58  
58 59 Please visit [https://github.com/modamoda/TwitterKoreanProcessorCS](https://github.com/modamoda/TwitterKoreanProcessorCS) for further information!
59 60  
60   -## node.js Wrapper
  61 +### node.js Wrapper
61 62  
62 63 [Ch0p](https://github.com/Ch0p) kindly offered an awesome node.js wrapper. Check it out here: [twtkrjs](https://github.com/Ch0p/twtkrjs)
63 64  
64   -## Python Wrapper
  65 +### Python Wrapper
65 66  
66 67 [Jaepil Jeong](https://github.com/jaepil) kindly offered a Python wrapper: https://github.com/jaepil/twkorean
67 68  
68 69  
69 70  
70 71  
71 72  
72 73  
73 74  
74 75  
... ... @@ -91,46 +92,32 @@
91 92 ```scala
92 93 import com.twitter.penguin.korean.TwitterKoreanProcessor
93 94 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase
94   -import com.twitter.penguin.korean.tokenizer.KoreanTokenizer
  95 +import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken
95 96  
96 97 object ScalaTwitterKoreanTextExample {
97 98 def main(args: Array[String]) {
98   - // Tokenize into List<String>
99   - val parsed: Seq[String] = TwitterKoreanProcessor
100   - .tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
101   - println(parsed)
102   - // List(한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ)
  99 + val text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
103 100  
104   - // Tokenize with Part-of-Speech information
105   - val parsedPos: Seq[KoreanTokenizer.KoreanToken] =
106   - TwitterKoreanProcessor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
107   - println(parsedPos)
108   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2))
  101 + // Normalize
  102 + val normalized: CharSequence = TwitterKoreanProcessor.normalize(text)
  103 + println(normalized)
  104 + // 한국어를 처리하는 예시입니다ㅋㅋ #한국어
109 105  
110   - // Tokenize without stemming
111   - val parsedPosNoStemming: Seq[KoreanTokenizer.KoreanToken] =
112   - TwitterKoreanProcessor
113   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = true, stemming = false)
114   - println(parsedPosNoStemming)
115   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2))
  106 + // Tokenize
  107 + val tokens: Seq[KoreanToken] = TwitterKoreanProcessor.tokenize(normalized)
  108 + println(tokens)
  109 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4))
116 110  
117   - // Tokenize without normalization and stemming
118   - val parsedPosParsingOnly: Seq[KoreanTokenizer.KoreanToken] = TwitterKoreanProcessor
119   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ", normalizization = false, stemming = false)
120   - println(parsedPosParsingOnly)
121   - // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5))
  111 + // Stemming
  112 + val stemmed: Seq[KoreanToken] = TwitterKoreanProcessor.stem(tokens)
122 113  
  114 + println(stemmed)
  115 + // List(한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4))
  116 +
123 117 // Phrase extraction
124   - val phrases: Seq[KoreanPhrase] = TwitterKoreanProcessor
125   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발")
  118 + val phrases: Seq[KoreanPhrase] = TwitterKoreanProcessor.extractPhrases(tokens, filterSpam = true, enableHashtags = true)
126 119 println(phrases)
127   - // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2))
128   -
129   - // Phrase extraction with the spam filter enabled
130   - val phrasesSpamFilitered: Seq[KoreanPhrase] = TwitterKoreanProcessor
131   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발", filterSpam = true)
132   - println(phrasesSpamFilitered)
133   - // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2))
  120 + // List(한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), #한국어(Hashtag: 18, 4))
134 121 }
135 122 }
136 123 ```
137 124  
138 125  
139 126  
140 127  
141 128  
142 129  
143 130  
144 131  
145 132  
146 133  
... ... @@ -139,67 +126,44 @@
139 126 ```java
140 127 import java.util.List;
141 128  
  129 +import scala.collection.Seq;
  130 +
  131 +import com.twitter.penguin.korean.TwitterKoreanProcessor;
142 132 import com.twitter.penguin.korean.TwitterKoreanProcessorJava;
143 133 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
144 134 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer;
145 135  
146 136 public class JavaTwitterKoreanTextExample {
147 137 public static void main(String[] args) {
148   - // Tokenize with normalization + stemmer
149   - TwitterKoreanProcessorJava processor = new TwitterKoreanProcessorJava.Builder().build();
  138 + String text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어";
150 139  
151   - List<String> parsedStrings = processor.tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
152   - System.out.println(parsedStrings);
153   - // output: [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ]
  140 + // Normalize
  141 + CharSequence normalized = TwitterKoreanProcessorJava.normalize(text);
  142 + System.out.println(normalized);
  143 + // 한국어를 처리하는 예시입니다ㅋㅋ #한국어
154 144  
155   - List<KoreanTokenizer.KoreanToken> parsed = processor
156   - .tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
157   - System.out.println(parsed);
158   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2)]
159 145  
  146 + // Tokenize
  147 + Seq<KoreanTokenizer.KoreanToken> tokens = TwitterKoreanProcessorJava.tokenize(normalized);
  148 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaStringList(tokens));
  149 + // [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ, #한국어]
  150 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(tokens));
  151 + // [한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4)]
160 152  
161   - // Tokenize without stemmer
162   - processor = new TwitterKoreanProcessorJava.Builder()
163   - .disableStemmer()
164   - .build();
165 153  
166   - parsedStrings = processor.tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
167   - System.out.println(parsedStrings);
168   - // output: [한국어, 를, 처리, 하는, 예시, 입니, 다, ㅋㅋ]
  154 + // Stemming
  155 + Seq<KoreanTokenizer.KoreanToken> stemmed = TwitterKoreanProcessorJava.stem(tokens);
  156 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaStringList(stemmed));
  157 + // [한국어, 를, 처리, 하다, 예시, 이다, ㅋㅋ, #한국어]
  158 + System.out.println(TwitterKoreanProcessorJava.tokensToJavaKoreanTokenList(stemmed));
  159 + // [한국어(Noun: 0, 3), 를(Josa: 3, 1), (Space: 4, 1), 처리(Noun: 5, 2), 하다(Verb: 7, 2), (Space: 9, 1), 예시(Noun: 10, 2), 이다(Adjective: 12, 3), ㅋㅋ(KoreanParticle: 15, 2), (Space: 17, 1), #한국어(Hashtag: 18, 4)]
169 160  
170   - parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
171   - System.out.println(parsed);
172   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 다(Eomi: 14, 1), ㅋㅋ(KoreanParticle: 15, 2)]
173 161  
174   -
175   - // Tokenize with neither normalization nor stemmer
176   - processor = new TwitterKoreanProcessorJava.Builder()
177   - .disableNormalizer()
178   - .disableStemmer()
179   - .build();
180   -
181   - parsedStrings = processor.tokenizeToStrings("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
182   - System.out.println(parsedStrings);
183   - // output: [한국어, 를, 처리, 하는, 예시, 입니, 닼, ㅋㅋㅋㅋㅋ]
184   -
185   - parsed = processor.tokenize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ");
186   - System.out.println(parsed);
187   - // output: [한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 하는(Verb: 7, 2), 예시(Noun: 10, 2), 입니(Adjective: 12, 2), 닼*(Noun: 14, 1), ㅋㅋㅋㅋㅋ(KoreanParticle: 15, 5)]
188   -
189   - List<KoreanPhraseExtractor.KoreanPhrase> phrases = processor
190   - .extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
  162 + // Phrase extraction
  163 + List<KoreanPhraseExtractor.KoreanPhrase> phrases = TwitterKoreanProcessorJava.extractPhrases(tokens, true, true);
191 164 System.out.println(phrases);
192   - // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), 시발(Noun: 18, 2)]
  165 + // [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2), #한국어(Hashtag: 18, 4)]
193 166  
194   - processor = new TwitterKoreanProcessorJava.Builder()
195   - .disableNormalizer()
196   - .disableStemmer()
197   - .enablePhraseExtractorSpamFilter()
198   - .build();
199   -
200   - phrases = processor.extractPhrases("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ 시발");
201   - System.out.println(phrases);
202   - // output: [한국어(Noun: 0, 3), 처리(Noun: 5, 2), 처리하는 예시(Noun: 5, 7), 예시(Noun: 10, 2)]
203 167 }
204 168 }
205 169 ```