public void makeFeatures()

in ctakes-smoking-status/src/main/java/org/apache/ctakes/smokingstatus/MLutil/GenerateTrainingData.java [70:178]


	/**
	 * Reads labelled sentences from a file and appends one feature vector per
	 * sentence to {@code features}.
	 * <p>
	 * Every non-empty line is split on {@code '|'} into sentence/label pairs;
	 * when a line holds several pairs the last one is used. The label is mapped
	 * by its first letter (case-insensitive): {@code p} to
	 * {@code Const.CLASS_PAST_SMOKER}, {@code c} to
	 * {@code Const.CLASS_CURR_SMOKER}, {@code s} to {@code Const.CLASS_SMOKER}.
	 * Any other label prints a message and terminates the JVM with status 1.
	 * <p>
	 * The sentence is lower-cased, a fixed set of punctuation characters and
	 * runs of two or more dashes are replaced by spaces, and the result is
	 * split on whitespace. Tokens that are empty or contained in
	 * {@code stopwords} are dropped; the remaining unigrams and their adjacent
	 * pairs (joined with {@code '_'}) are matched case-insensitively against
	 * {@code keywords}.
	 * <p>
	 * The emitted vector holds, in order: a 0/1 flag per keyword (in the
	 * iteration order of {@code keywords}), a 0/1 flag telling whether any
	 * unigram looks like a year, decade or date, and finally the class label.
	 * <p>
	 * Lines that do not tokenize into complete sentence/label pairs are
	 * reported on {@code System.err} and skipped. Any exception raised while
	 * reading is printed and ends processing of the file; the reader is closed
	 * in every case.
	 *
	 * @param fname
	 *            path of the training file, one {@code sentence|label} record
	 *            per line
	 */
	public void makeFeatures(String fname) {
		// One pre-compiled alternation replaces the nine String.matches calls
		// that were previously re-compiled for every token. Alternatives are
		// unchanged: 19xx, 19xxs, 20xx, 20xxs, x0s, m/d, m/yyyy, m/d/yy,
		// m/d/yyyy (with '/' or '-' as separator). Fully qualified because the
		// file's import block may not include java.util.regex.
		final java.util.regex.Pattern dateToken = java.util.regex.Pattern
				.compile("19\\d\\d|19\\d\\ds|20\\d\\d|20\\d\\ds|[1-9]0s"
						+ "|\\d{1,2}[/-]\\d{1,2}"
						+ "|\\d{1,2}[/-]\\d{4}"
						+ "|\\d{1,2}[/-]\\d{1,2}[/-]\\d{2}"
						+ "|\\d{1,2}[/-]\\d{1,2}[/-]\\d{4}");

		BufferedReader fin = null;
		try {
			fin = new BufferedReader(new FileReader(fname));
			String str;
			while ((str = fin.readLine()) != null) {
				if (str.length() == 0)
					continue;

				StringTokenizer strTok = new StringTokenizer(str, "|");

				// Tokens are consumed in (sentence, label) pairs. Zero tokens
				// would otherwise reuse the previous line's values, and an odd
				// count would throw from nextToken() and abort the whole file.
				int tokenCount = strTok.countTokens();
				if (tokenCount == 0 || tokenCount % 2 != 0) {
					System.err.println(
							"Skipping malformed line (expected sentence|label): "
									+ str);
					continue;
				}

				// Declared per line so nothing leaks from one record to the next.
				String sen = "", cls = "";
				while (strTok.hasMoreTokens()) {
					sen = strTok.nextToken().trim();
					cls = strTok.nextToken().trim();
				}

				// assign class label
				String rawLabel = cls.toLowerCase();
				if (rawLabel.startsWith("p"))
					cls = Const.CLASS_PAST_SMOKER;
				else if (rawLabel.startsWith("c"))
					cls = Const.CLASS_CURR_SMOKER;
				else if (rawLabel.startsWith("s"))
					cls = Const.CLASS_SMOKER;
				else {
					System.out.println("Undefined class label:" + cls);
					System.exit(1);
				}

				// normalize: lower-case, blank out punctuation and dash runs
				sen = sen.toLowerCase().replaceAll("[.?!:;()',\"{}<>#+]", " ")
						.trim();
				sen = sen.replaceAll("-{2,}", " ").trim();

				String[] senTokens = sen.split("\\s");
				List<String> unigrams = new ArrayList<String>();
				List<String> bigrams = new ArrayList<String>();

				for (int i = 0; i < senTokens.length; i++)
					if (!stopwords.contains(senTokens[i])
							&& senTokens[i].trim().length() > 0)
						unigrams.add(senTokens[i]);

				// bigrams are built from the stopword-filtered unigrams
				for (int i = 0; i < unigrams.size() - 1; i++)
					bigrams.add(unigrams.get(i) + "_" + unigrams.get(i + 1));

				List<Comparable> feature = new ArrayList<Comparable>();

				// make binary keywords vector
				// (explicit iterator kept: the declared type of keywords is
				// not visible here, and a for-each over a raw collection would
				// not compile with a String loop variable)
				Iterator<String> itr = keywords.iterator();
				while (itr.hasNext()) {
					String k = (String) itr.next();
					int val = 0;

					// a keyword containing '_' is a bigram, otherwise a unigram
					List<String> candidates = (k.indexOf("_") != -1) ? bigrams
							: unigrams;
					for (String candidate : candidates) {
						if (k.equalsIgnoreCase(candidate)) {
							val = 1;
							break;
						}
					}

					feature.add(val);
				}

				// date feature - naive feature (updated Apr-9-2009)
				int hasYear = 0;
				for (String s : unigrams) {
					if (dateToken.matcher(s).matches()) {
						hasYear = 1;
						break;
					}
				}
				feature.add(hasYear);

				// add class label
				feature.add(cls);

				// add feature to the feature set
				features.add(feature);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close on every path; previously the reader leaked whenever the
			// loop threw.
			if (fin != null) {
				try {
					fin.close();
				} catch (Exception closeError) {
					closeError.printStackTrace();
				}
			}
		}
	}