#!/usr/local/bin/python
#
#    text2data.py
#    parse .. into a SVMlight format and dictionary.
#    $Id: text2data.py,v 1.5 2023/02/12 16:54:17 daichi Exp $
#
#    NOTE(review): the description above appears to have lost the name of the
#    document markup it refers to; see the note on _DOC_OPEN below.
#
import re
import sys
import regex
import MeCab
from eprint import eprintf
from collections import defaultdict

tagger = MeCab.Tagger('-Owakati')

# NOTE(review): in the copy of this file that was reviewed, the text between
# "re.search (r'^" and "0:" was missing in both save_data() and vocabulary(),
# so the original document-boundary patterns are unknown.  The two patterns
# below are a reconstruction (a line opening a document, a line closing it);
# confirm them against the real corpus format before relying on this script.
_DOC_OPEN = re.compile(r'^<doc')
_DOC_CLOSE = re.compile(r'^</doc')

# Every ASCII digit is replaced by '#' before a word is counted or looked up.
_DIGIT = re.compile(r'[0-9]')

_HIRAGANA_ONLY = regex.compile(r'^\p{Hiragana}+$')
_HAS_CONTENT = regex.compile(
    r'(\p{Han}|\p{Hiragana}|\p{Katakana}|\p{alnum})')


def _documents(path):
    """Yield each non-empty document of *path* as a list of words.

    Lines that are not document boundaries are tokenized with the
    module-level MeCab tagger and appended to the current document.
    When a closing boundary is reached and the document has at least one
    word, its words are yielded with ASCII digits replaced by '#'.
    """
    words = []    # defined up front so text before the first marker is safe
    with open(path, 'r') as fh:
        for line in fh:
            # _DOC_CLOSE cannot match a line that _DOC_OPEN matches, and
            # vice versa, so the order of these two tests does not matter.
            if _DOC_OPEN.search(line):
                words = []
            elif _DOC_CLOSE.search(line):
                if words:
                    yield [_DIGIT.sub('#', word) for word in words]
                words = []
            else:
                words += tagger.parse(line.rstrip('\n')).split()


def save_data(file, input, dic):
    """Write one "id:count id:count ..." line per document to *file*.

    file  -- path of the data file to create.
    input -- path of the text corpus to read.
    dic   -- mapping from word to integer id; words not in it are ignored.

    Features on a line are ordered by descending count, not by id.
    A document whose words are all outside *dic* produces an empty line.
    """
    eprintf('writing data to %s.. ' % file)
    with open(file, 'w') as oh:
        for words in _documents(input):
            freq = defaultdict(int)
            for word in words:
                if word in dic:
                    freq[dic[word]] += 1
            ranked = sorted(freq.items(), key=lambda x: x[1], reverse=True)
            oh.write(' '.join('%d:%d' % (wid, count)
                              for wid, count in ranked))
            oh.write('\n')
    eprintf('done.\n', clear=False)


def save_dic(file, dic):
    """Write *dic* to *file* as "id<TAB>word" lines in increasing id order."""
    eprintf('writing dic to %s.. ' % file)
    with open(file, 'w') as oh:
        for word, wid in sorted(dic.items(), key=lambda x: x[1]):
            oh.write('%d\t%s\n' % (wid, word))
    eprintf('done.\n', clear=False)


def parse(file, dic, output):
    """Write the dictionary to output.lex and the data to output.dat."""
    save_dic(output + '.lex', dic)
    save_data(output + '.dat', file, dic)


def vocabulary(file, threshold):
    """Build the word -> id dictionary for the corpus in *file*.

    Words are counted over all documents.  Those occurring at least
    *threshold* times and accepted by isvalid() receive ids 1, 2, ...
    in order of descending frequency.
    """
    freq = defaultdict(int)
    for words in _documents(file):
        for word in words:
            freq[word] += 1
    dic = {}
    ranked = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    for word, count in ranked:
        if count >= threshold and isvalid(word):
            dic[word] = len(dic) + 1
    return dic


def isvalid(s):
    """Return True if *s* should be kept as a vocabulary word.

    Strings made only of hiragana are rejected; any other string is
    accepted when it contains at least one Han, hiragana, katakana or
    alphanumeric character.
    """
    if _HIRAGANA_ONLY.search(s):
        return False
    return _HAS_CONTENT.search(s) is not None


def usage():
    """Print the command-line synopsis and exit."""
    print('usage: % text2data.py train.txt output{.dat,lex} [threshold]')
    print('$Id: text2data.py,v 1.5 2023/02/12 16:54:17 daichi Exp $')
    sys.exit(0)


def main():
    """Entry point: text2data.py train.txt output [threshold]."""
    if len(sys.argv) < 3:
        usage()
    else:
        text = sys.argv[1]
        output = sys.argv[2]
        # the frequency threshold defaults to 10 when not given
        threshold = int(sys.argv[3]) if len(sys.argv) > 3 else 10
        dic = vocabulary(text, threshold)
        parse(text, dic, output)


if __name__ == "__main__":
    main()