# creates a simple thesaurus from a file of word pairs
# works on a plain text file in which all non-blank lines
# must have exactly two fields (words) in them: a wordpair file
# the output is a thesaurus file in which synonyms are sorted
# note: the input file does not have to be sorted, therefore
# this is not exactly an inverse to the related program
# ths2prs.py since that program produces a sorted file
# also in ths2prs the synonyms on each line do not have
# to be sorted
#
# usage: python prs2ths.py wordpair-file thesaurus-file
#
# note: written to be read and maintained, not to be
# short and sweet -- as all good programs should be
#
# prs2ths.py is Copyright (C) 2010 Douglas Pardoe Wilson
# This program comes with ABSOLUTELY NO WARRANTY
# This is free software, and you are welcome to redistribute it
# subject to the conditions of the GNU General Public License (GPL)

import itertools
import operator
import string  # no longer used below; kept from the original import list
import sys

USAGE = "usage: python prs2ths.py wordpair-file thesaurus-file"


class PairFormatError(ValueError):
    """Raised when a non-blank input line does not hold exactly two words."""


def _read_pairs(path):
    """Return the (lemma, synonym) pairs stored in the wordpair file *path*.

    Blank (whitespace-only) lines are skipped.  Fields are separated by
    any run of whitespace.

    Raises:
        PairFormatError: for the first non-blank line that does not have
            exactly two fields; the message carries the line number.
    """
    pairs = []
    with open(path, "r") as infile:
        for lineno, line in enumerate(infile, 1):
            fields = line.split()
            if not fields:
                continue  # blank lines are allowed in a wordpair file
            if len(fields) != 2:
                raise PairFormatError(
                    "not a pair (line %d): %s" % (lineno, line.strip())
                )
            pairs.append(tuple(fields))
    return pairs


def _thesaurus_lines(pairs):
    """Return one "lemma synonym synonym ..." string per distinct lemma.

    The pairs are sorted as (lemma, synonym) tuples rather than as raw
    text, so the lemmas come out in order and so do the synonyms of each
    lemma, whatever whitespace separated the two words in the input.
    Repeated pairs are kept, not merged.
    """
    lemma_of = operator.itemgetter(0)
    lines = []
    # groupby only merges adjacent items, hence the sort on the same key
    for lemma, group in itertools.groupby(sorted(pairs), key=lemma_of):
        synonyms = [synonym for _, synonym in group]
        lines.append(" ".join([lemma] + synonyms))
    return lines


def main(argv=None):
    """Convert a wordpair file into a thesaurus file.

    Args:
        argv: command line in sys.argv layout, i.e.
            [program, wordpair-file, thesaurus-file]; defaults to sys.argv.

    Returns:
        0 on success, 1 when the input holds a malformed line (nothing is
        written and the output file is not opened), 2 on a usage error.
        Failures to open or read/write the files propagate as exceptions.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        sys.stderr.write(USAGE + "\n")
        return 2

    try:
        pairs = _read_pairs(argv[1])
    except PairFormatError as err:
        sys.stderr.write("error, %s\n" % err)
        return 1
    print("%d  lines read" % len(pairs))

    lines = _thesaurus_lines(pairs)
    # the output is opened only after the input proved valid, so a bad
    # input can never leave a truncated thesaurus behind
    with open(argv[2], "w") as outfile:
        # as before, lines are newline-separated with no newline after the last
        outfile.write("\n".join(lines))
    print("%d  lines written" % len(lines))
    return 0


if __name__ == "__main__":
    sys.exit(main())