Changeset 85 for trunk/convertdb.rb
- Timestamp: 07/07/08 20:39:04 (5 months ago)
- Files: 1 modified
  - trunk/convertdb.rb (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
trunk/convertdb.rb
r79 r85
 39  39    end
 40  40
     41  + require 'pathname'
 41  42    require 'strscan'
 42  43    require 'wordnet'
  …   …
 73  74
 74  75    # Temporary location for the lexicon data files
 75      - BuildDir = File::join( File::dirname(__FILE__), File::basename(WordNet::Lexicon::DefaultDbEnv) )
     76  + BuildDir = Pathname.new( __FILE__ ).expand_path.dirname +
     77  + Pathname.new( WordNet::Lexicon::DefaultDbEnv ).basename
 76  78
 77  79
  …   …
 92  94    # Open the database and check to be sure it's empty. Confirm overwrite if
 93  95    # not. Checkpoint and set up logging proc if debugging.
 94      - if File::exists?( BuildDir )
     96  + if BuildDir.exist?
 95  97    message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\
 96  98    "will be overwritten.\n"
 97  99    abort( "user cancelled." ) unless
 98 100    /^y/i =~ promptWithDefault( "Continue?", "n" )
 99      - FileUtils::rm_rf( BuildDir )
    101  + BuildDir.rmtree
100 102    end
101 103
  …   …
117 119
118 120    # Open the lexicon readwrite into the temporary datadir
119      - FileUtils::mkdir( BuildDir )
120      - lexicon = WordNet::Lexicon::new( BuildDir , 0666 )
    121  + BuildDir.mkpath
    122  + lexicon = WordNet::Lexicon::new( BuildDir.to_s, 0666 )
121 123
122 124    # Process each fileset
123 125    [ # Fileset, name, database handle, processor
124      - Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parseIndexLine) ),
125      - Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parseMorphLine) ),
126      - Fileset::new( DataFiles, "data", lexicon.data_db, method(:parseSynsetLine) ),
127      - ].each {|set|
    126  + Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parse_index_line) ),
    127  + Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parse_morph_line) ),
    128  + Fileset::new( DataFiles, "data", lexicon.data_db, method(:parse_synset_line) ),
    129  + ].each do |set|
128 130    message "Converting %s files...\n" % set.name
129 131    set.db.truncate
  …   …
131 133    # Process each file in the set with the appropriate processor method and
132 134    # insert results into the corresponding table.
133      - set.files.each {|file,pos|
    135  + set.files.each do |file,pos|
134 136    message " #{file}..."
135 137
  …   …
142 144    txn, dbh = lexicon.env.txn_begin( 0, set.db )
143 145    entries = lineNumber = errors = 0
144      - File::readlines( filepath ).each {|line|
    146  + File::readlines( filepath ).each do |line|
145 147    lineNumber += 1
146 148    next if /^\s/ =~ line
  …   …
164 166    txn, dbh = lexicon.env.txn_begin( 0, set.db )
165 167    end
166      - }
    168  + end
    169  +
167 170    message "committing..."
168 171    txn.commit( BDB::TXN_SYNC )
169 172    message "done (%d entries, %d errors).\n" %
170 173    [ entries, errors ]
171      - }
    174  + end
    175  +
    176  + lock_stats = lexicon.env.lock_stat
    177  + message "Lock statistics:\n"
    178  + puts " Lock objects: #{lock_stats['st_nobjects']}/#{lock_stats['st_maxnobjects']}",
    179  + " Locks: #{lock_stats['st_nlocks']}/#{lock_stats['st_maxnlocks']}",
    180  + " Lockers: #{lock_stats['st_nlockers']}/#{lock_stats['st_maxnlockers']}"
    181  +
172 182
173 183    message "Checkpointing DB and cleaning logs..."
  …   …
175 185    lexicon.clean_logs
176 186    puts "done."
177      - }
    187  + end
178 188
179 189    message "done.\n\n"
  …   …
191 201    ### +pos+ argument is not used -- it's just to make the interface between all
192 202    ### three processor methods the same.
193      - def parseIndexLine( string, lineNumber, pos=nil )
    203  + def parse_index_line( string, lineNumber, pos=nil )
194 204    $scanner.string = string
195 205    synsets = []
  …   …
233 243
234 244    ### "Parse" a morph line and return it as a key and value.
235      - def parseMorphLine( string, lineNumber, pos )
    245  + def parse_morph_line( string, lineNumber, pos )
236 246    key, value = string.split
237 247    return "#{key}%#{pos}", value
  …   …
259 269    ### Parse an entry from a data file and return the key and data. Returns +nil+
260 270    ### if any part of the entry isn't able to be parsed.
261      - def parseSynsetLine( string, lineNumber, pos )
    271  + def parse_synset_line( string, lineNumber, pos )
262 272    $scanner.string = string
263 273
