in ctakes-smoking-status/src/main/java/org/apache/ctakes/smokingstatus/MLutil/GenerateTrainingData.java [70:178]
/**
 * Reads labelled sentences from a file and appends one feature vector per
 * usable line to {@code features}.
 *
 * <p>Each non-empty line is split on {@code '|'} into a sentence and a class
 * label. The label is mapped by its first letter (case-insensitive):
 * {@code p} to {@code Const.CLASS_PAST_SMOKER}, {@code c} to
 * {@code Const.CLASS_CURR_SMOKER}, {@code s} to {@code Const.CLASS_SMOKER}.
 * Any other label prints a message and terminates the JVM with exit code 1.
 *
 * <p>The resulting vector holds, in order:
 * <ol>
 * <li>one 0/1 entry per element of {@code keywords} (in its iteration
 * order); keywords containing {@code '_'} are matched against bigrams of
 * adjacent retained tokens, all others against single tokens;</li>
 * <li>one 0/1 entry that is 1 when any token looks like a year, decade or
 * numeric date;</li>
 * <li>the mapped class label.</li>
 * </ol>
 *
 * <p>Lines that do not contain a complete sentence/label pair are reported on
 * {@code System.err} and skipped. Any exception raised while reading is
 * printed and stops processing; the reader is closed in every case.
 *
 * @param fname path of the {@code sentence|label} training file
 */
public void makeFeatures(String fname) {
    // Compiled once per call instead of once per token per alternative.
    // Covers: 19xx / 19xxs / 20xx / 20xxs, decades such as "90s",
    // and m/d, m/yyyy, m/d/yy, m/d/yyyy with '/' or '-' separators.
    final java.util.regex.Pattern dateToken = java.util.regex.Pattern.compile(
            "(?:19\\d\\ds?"
            + "|20\\d\\ds?"
            + "|[1-9]0s"
            + "|\\d{1,2}[/-]\\d{1,2}"
            + "|\\d{1,2}[/-]\\d{4}"
            + "|\\d{1,2}[/-]\\d{1,2}[/-]\\d{2}"
            + "|\\d{1,2}[/-]\\d{1,2}[/-]\\d{4})");

    BufferedReader fin = null;
    try {
        fin = new BufferedReader(new FileReader(fname));
        String str;
        while ((str = fin.readLine()) != null) {
            if (str.length() == 0)
                continue;

            StringTokenizer strTok = new StringTokenizer(str, "|");
            // Tokens are consumed in sentence/label pairs. An empty or odd
            // token count cannot be paired: skip the line instead of
            // failing mid-file or reusing the previous line's values.
            int tokenCount = strTok.countTokens();
            if (tokenCount == 0 || tokenCount % 2 != 0) {
                System.err.println(
                        "Skipping malformed line (expected sentence|label): " + str);
                continue;
            }
            String sen = "";
            String cls = "";
            // When a line carries several pairs, the last pair wins.
            while (strTok.hasMoreTokens()) {
                sen = strTok.nextToken().trim();
                cls = strTok.nextToken().trim();
            }

            // assign class label
            String labelKey = cls.toLowerCase(java.util.Locale.ENGLISH);
            if (labelKey.startsWith("p"))
                cls = Const.CLASS_PAST_SMOKER;
            else if (labelKey.startsWith("c"))
                cls = Const.CLASS_CURR_SMOKER;
            else if (labelKey.startsWith("s"))
                cls = Const.CLASS_SMOKER;
            else {
                System.out.println("Undefined class label:" + cls);
                System.exit(1);
            }

            // Lower-case with a fixed locale so tokens stay comparable with
            // the keywords regardless of the JVM default locale, then blank
            // out punctuation and runs of two or more dashes.
            sen = sen.toLowerCase(java.util.Locale.ENGLISH)
                    .replaceAll("[.?!:;()',\"{}<>#+]", " ").trim();
            sen = sen.replaceAll("-{2,}", " ").trim();

            // unigrams: non-empty tokens that are not stopwords
            String[] senTokens = sen.split("\\s");
            List<String> unigrams = new ArrayList<String>();
            for (int i = 0; i < senTokens.length; i++) {
                String token = senTokens[i];
                if (!stopwords.contains(token) && token.trim().length() > 0)
                    unigrams.add(token);
            }

            // bigrams: adjacent retained unigrams joined by '_'
            List<String> bigrams = new ArrayList<String>();
            for (int i = 0; i < unigrams.size() - 1; i++)
                bigrams.add(unigrams.get(i) + "_" + unigrams.get(i + 1));

            List<Comparable> feature = new ArrayList<Comparable>();

            // make binary keywords vector
            Iterator<String> itr = keywords.iterator();
            while (itr.hasNext()) {
                String k = itr.next();
                // a keyword containing '_' is a bigram, otherwise a unigram
                List<String> candidates = (k.indexOf("_") != -1) ? bigrams : unigrams;
                int val = 0;
                for (String candidate : candidates) {
                    if (k.equalsIgnoreCase(candidate)) {
                        val = 1;
                        break;
                    }
                }
                feature.add(val);
            }

            // date feature - naive feature
            int hasYear = 0;
            for (String s : unigrams) {
                if (dateToken.matcher(s).matches()) {
                    hasYear = 1;
                    break;
                }
            }
            feature.add(hasYear);

            // add class label
            feature.add(cls);
            // add feature to the feature set
            features.add(feature);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the file handle even when reading failed part-way.
        if (fin != null) {
            try {
                fin.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}