--- core/src/dualist/pipes/DocumentPipe.java.orig 2012-02-11 05:07:28.000000000 +0900
+++ core/src/dualist/pipes/DocumentPipe.java 2012-02-22 22:44:45.000000000 +0900
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.types.Instance;
+import dualist.pipes.SimpleMecabPipe;
public class DocumentPipe extends Pipe {
private Pipe myPipe = new SerialPipes(new Pipe[] {
new CharSequenceReplace(Pattern.compile("&(.*?);"), ""),
new CharSequenceReplace(Pattern.compile("[0-9]+"), "00"),
new CharSequenceLowercase(),
+ // Tokenizer choice: SimpleMecabPipe when system property "dualist.lang" is "ja"; otherwise the regex tokenizer that follows.
+ "ja".equals(System.getProperty("dualist.lang")) ?
+ new SimpleMecabPipe() :
// new CharSequence2TokenSequence(CharSequenceLexer.LEX_WORD_CLASSES),
new CharSequence2TokenSequence("[\\p{L}\\p{Mn}]+"),
new TokenSequenceRemoveStopwords(),
--- build.xml.orig 2012-03-08 23:07:56.000000000 +0900
+++ build.xml 2012-03-09 09:32:14.000000000 +0900
<mkdir dir="${classes.dir}"/>
<!-- <javac srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/> -->
- <javac debug="true" debuglevel="lines,vars,source" srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/>
+ <javac debug="true" debuglevel="lines,vars,source" srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath" encoding="UTF-8"/>
<target name="jar" depends="compile">