Parent Directory
|
Revision Log
Allow quotes in keywords.
1 #!/usr/bin/perl 2 3 #use strict; 4 use File::Find; 5 use File::Basename; 6 use Cwd; 7 use DBI; 8 9 if(! -e "create_tables2.sql") { 10 print "Cannot find file create_tables2.sql.\n"; 11 print "This needs to be run from the directory it was unpacked to,\n"; 12 print "in which case \"create_tables2.sql\" should be in this directory.\n"; 13 exit; 14 } 15 16 my $passwd; 17 18 if(@ARGV != 1) { 19 print "What is the root password for mysql? "; 20 $passwd = <STDIN>; 21 chomp($passwd); 22 } else { 23 $passwd = $ARGV[0]; 24 } 25 26 my $user = 'root'; 27 my $libraryRoot = Cwd::cwd(); 28 my $verbose =0; 29 my $cnt = 0; 30 31 system("mysql -uroot -p$passwd -B ProblemLibrary <./create_tables2.sql"); 32 print "Mysql database reinitialized\n"; 33 34 my $mydb = "dbi:mysql:ProblemLibrary"; 35 36 print "Converting data from tagged pgfiles into mysql.\n"; 37 # From pgfile 38 ## DBchapter('Limits and Derivatives') 39 ## DBsection('Calculating Limits using the Limit Laws') 40 ## Date('6/3/2002') 41 ## Author('Tangan Gao') 42 ## Institution('csulb') 43 ## TitleText1('Calculus Early Transcendentals') 44 ## EditionText1('4') 45 ## AuthorText1('Stewart') 46 ## Section1('2.3') 47 ## Problem1('7') 48 # 49 # The database structure is in the file create_tables2.sql and its ER-graph is 50 # the file wwdb_er_graph.pdf. 51 52 my ($name,$chapnum,$secnum); 53 my ($pgfile,$pgpath,@keyword,$tagged,$chapter,$section,$institution,$author,$text,$edition,$date,$textauthor,$textsection,$textproblem); 54 55 #recursive search for all pg files 56 my $dbh = DBI->connect('dbi:mysql:ProblemLibrary', $user, $passwd); 57 58 $dbh->{RaiseError} = 1; #bail if anything fails via die 59 60 find({ wanted => \&pgfiles, follow_fast=> 1}, $libraryRoot); 61 62 sub kwtidy { 63 my $s = shift; 64 $s =~ s/\W//g; 65 $s =~ s/_//g; 66 $s = lc($s); 67 return($s); 68 } 69 70 sub keywordcleaner { 71 my $string = shift; 72 my @spl1 = split /,/, $string; 73 my @spl2 = map(kwtidy($_), @spl1); 74 return(@spl2); 75 } 76 77 sub dbug { 78 my $msg = shift; 79 $cnt++; 80 print $msg if($verbose>1); 81 # if($cnt==5000) { print "\n"; $cnt=0;} 82 print "." if($verbose==1 and ($cnt % 100 == 0)); 83 } 84 85 # process each file returned by the find command. 86 sub pgfiles { 87 my $name = $File::Find::name; 88 my ($subject, $chapter, $section, $date, $institution, $author, $text); 89 my ($edition, $textauthor, $textsection, $textproblem); 90 my @textproblems = (-1); 91 if($name =~ /pg$/) { 92 $pgfile = basename($name); 93 $pgpath = dirname($name); 94 $pgpath =~ s|^$libraryRoot/||; 95 open(IN,"$name") or die "can not open $name: $!"; 96 $tagged = 0; 97 while(<IN>) { 98 SWITCH: { 99 if(/KEYWORDS\((.*)\)/i) { 100 @keyword = keywordcleaner($1); 101 last SWITCH; 102 } 103 if(/DBsubject\(\s*'(.*?)'\s*\)/) { 104 $subject = $1; 105 $subject =~ s/'/\'/g; 106 last SWITCH; 107 } 108 if(/DBchapter\(\s*'(.*?)'\s*\)/) { 109 $chapter = $1; 110 $chapter =~ s/'/\'/g; 111 $tagged = 1; 112 last SWITCH; 113 } 114 if(/DBsection\(\s*'(.*?)'\s*\)/) { 115 $section = $1; 116 $section =~ s/'/\'/g; 117 last SWITCH; 118 } 119 if(/Date\(\s*'(.*?)'\s*\)/) { 120 $date = $1; 121 $date =~ s/'/\'/g; 122 last SWITCH; 123 } 124 if(/Institution\(\s*'(.*?)'\s*\)/) { 125 $institution = $1; 126 $institution =~ s/'/\'/g; 127 last SWITCH; 128 } 129 if(/Author\(\s*'(.*?)'\s*\)/) { 130 $author = $1; 131 $author =~ s/'/\'/g; 132 last SWITCH; 133 } 134 if(/TitleText1\(\s*'(.*?)'\s*\)/) { 135 $text = $1; 136 $text =~ s/'/\'/g; 137 last SWITCH; 138 } 139 if(/EditionText1\(\s*'(.*?)'\s*\)/) 140 { 141 $edition = $1; 142 $edition =~ s/'/\'/g; 143 last SWITCH; 144 } 145 if(/AuthorText1\(\s*'(.*?)'\s*\)/) 146 { 147 $textauthor = $1; 148 $textauthor =~ s/'/\'/g; 149 last SWITCH; 150 } 151 if(/Section1\(\s*'(.*?)'\s*\)/) 152 { 153 $textsection = $1; 154 $textsection =~ s/'/\'/g; 155 last SWITCH; 156 } 157 if(/Problem1\(\s*'(.*?)'\s*\)/) 158 { 159 $textproblem = $1; 160 $textproblem =~ s/\D/ /g; 161 @textproblems = split /\s+/, $textproblem; 162 @textproblems = grep { $_ =~ /\S/ } $textproblem; 163 @textproblems = (-1) unless(scalar(@textproblems)); 164 #print "$textproblem\n" if ($textproblem !~ /^[\s\d]+$/); 165 # $textproblem =~ s/'/\'/g; 166 last SWITCH; 167 } 168 }}#end of SWITCH and while 169 if($tagged) { 170 # 171 # kludge to fix the omission of a subject field 172 unless($subject) { 173 if($text =~ /precalculus/i) { $subject = "Precalculus" } 174 elsif($text =~ /calculus/i) { $subject = "Calculus" } 175 elsif($text =~ /linear/i) { $subject = "Linear Algebra" } 176 elsif($text =~ /algebra/i) { $subject = "Algebra" } 177 elsif($text =~ /statistic/i) { $subject = "Statistics" } 178 elsif($text =~ /financial/i) { $subject = "Financial Mathematics" } 179 else { $subject = "Misc" } 180 } 181 # From the pgfile we just looked at, 182 ## DBchapter('Limits and Derivatives') in $chapter 183 ## DBsection('Calculating Limits using the Limit Laws') in $section 184 ## Date('6/3/2002') in $date 185 ## Author('Tangan Gao') in $author 186 ## Institution('csulb') in $institution 187 ## TitleText1('Calculus Early Transcendentals') in $text 188 ## EditionText1('4') in $edition 189 ## AuthorText1('Stewart') in $textauthor 190 ## Section1('2.3') in $textsection 191 ## Problem1('7') in $textproblem 192 # 193 # The database structure is in the file create_tables2.sql and its ER-graph 194 # is the file wwdb_er_graph.pdf. Insert, in order, into the tables 195 # in that file. 196 # 197 if($textsection =~ /(\d*?)\.(\d*)/) { 198 $chapnum = $1; 199 $secnum = $2; 200 } else { 201 $chapnum = undef; #no valid number assigned 202 $secnum = $textsection; 203 }; 204 205 #selectrow_array returns first field of first row in scalar context or undef 206 # undef for failure also, $dbh->{RaiseError} = 1 should catch that case. 207 # 208 # DBsubject table 209 # 210 $query = "SELECT DBsubject_id FROM DBsubject WHERE name = \"$subject\""; 211 my $DBsubject_id = $dbh->selectrow_array($query); 212 if(!defined($DBsubject_id)){ 213 $dbh->do( 214 "INSERT INTO DBsubject 215 VALUES( 216 \"\", 217 \"$subject\" 218 )" 219 ); 220 dbug "INSERT INTO DBsubject VALUES(\"\",\"$subject\")\n"; 221 $DBsubject_id = $dbh->selectrow_array($query); 222 } 223 224 # DBchapter table 225 # 226 $query = "SELECT DBchapter_id FROM DBchapter WHERE name = \"$chapter\" and DBsubject_id = $DBsubject_id"; 227 my $DBchapter_id = $dbh->selectrow_array($query); 228 if(!defined($DBchapter_id)){ 229 $dbh->do("INSERT INTO DBchapter 230 VALUES( 231 \"\", 232 \"$chapter\", 233 \"$DBsubject_id\" 234 )" 235 ); 236 dbug "INSERT INTO DBchapter VALUES( \"\", \"$chapter\", \"$DBsubject_id\)\n"; 237 $DBchapter_id = $dbh->selectrow_array($query); 238 } 239 240 # DBsection table 241 # 242 $query = "SELECT DBsection_id FROM DBsection WHERE name = \"$section\" AND DBchapter_id = $DBchapter_id"; 243 my $DBsection_id = $dbh->selectrow_array($query); 244 if(!defined($DBsection_id)){ 245 $dbh->do("INSERT INTO DBsection 246 VALUES( 247 \"\", 248 \"$section\", 249 \"$DBchapter_id\" 250 )" 251 ); 252 dbug "INSERT INTO DBsection VALUES( \"\", \"$section\", \"$DBchapter_id\", \"$DBsubject_id\" )\n"; 253 $DBsection_id = $dbh->selectrow_array($query); 254 } 255 256 # institution table 257 # 258 $query = "SELECT institution_id FROM institution WHERE name = \"$institution\""; 259 my $institution_id = $dbh->selectrow_array($query); 260 if(!defined($institution_id)){ 261 $dbh->do("INSERT INTO institution 262 VALUES( 263 \"\", 264 \"$institution\", 265 \"\", 266 \"\", 267 \"\", 268 \"\", 269 \"\", 270 \"\" 271 )" 272 ); 273 dbug "INSERT INTO institution VALUES( \"\", \"$institution\", \"\", \"\", \"\", \"\", \"\", \"\")\n"; 274 $institution_id = $dbh->selectrow_array($query); 275 } 276 277 # author table 278 # 279 $author =~ /(.*?)\s(\w+)\s*$/; 280 my $firstname = $1; 281 my $lastname = $2; 282 #remove leading and trailing spaces from firstname, which includes any middle name too. 283 $firstname =~ s/^\s*//; 284 $firstname =~ s/\s*$//; 285 $query = "SELECT author_id FROM author WHERE lastname = \"$lastname\" AND firstname=\"$firstname\""; 286 my $author_id = $dbh->selectrow_array($query); 287 if(!defined($author_id)){ 288 $dbh->do("INSERT INTO author 289 VALUES( 290 \"\", 291 \"$institution_id\", 292 \"$lastname\", 293 \"$firstname\", 294 \"\" 295 )" 296 ); 297 dbug "INSERT INTO author VALUES( \"\", \"$institution_id\", \"$lastname\", \"$firstname\", \"\" )\n"; 298 $author_id = $dbh->selectrow_array($query); 299 } 300 301 # path table 302 # 303 $query = "SELECT path_id FROM path WHERE path = \"$pgpath\""; 304 my $path_id = $dbh->selectrow_array($query); 305 if(!defined($path_id)){ 306 $dbh->do("INSERT INTO path 307 VALUES( 308 \"\", 309 \"$pgpath\", 310 \"\", 311 \"\" 312 )" 313 ); 314 dbug "INSERT INTO path VALUES( \"\", \"$path\", \"\", \"\" )\n"; 315 $path_id = $dbh->selectrow_array($query); 316 } 317 318 # pgfile table 319 # 320 my $pgfile_id; 321 $dbh->do("INSERT INTO pgfile 322 VALUES( 323 \"\", 324 \"$DBsection_id\", 325 \"$author_id\", 326 \"$institution_id\", 327 \"$path_id\", 328 \"$pgfile\", 329 \"\" 330 )" 331 ); 332 dbug "INSERT INTO pgfile VALUES( \"\", \"$DBsection_id\", \"$author_id\", \"$institution_id\", \"$path_id\", \"$pgfile\", \"\" )\n"; 333 $query = "SELECT pgfile_id FROM pgfile WHERE filename = \"$pgfile\""; 334 $pgfile_id = $dbh->selectrow_array($query); 335 # keyword table, and problem_keyword many-many table 336 # 337 foreach my $keyword (@keyword) 338 { 339 $keyword =~ s/[\'\"]//g; 340 $query = "SELECT keyword_id FROM keyword WHERE keyword = \"$keyword\""; 341 my $keyword_id = $dbh->selectrow_array($query); 342 if(!defined($keyword_id)){ 343 $dbh->do("INSERT INTO keyword 344 VALUES( 345 \"\", 346 \"$keyword\" 347 )" 348 ); 349 dbug "INSERT INTO keyword VALUES( \"\", \"$keyword\")\n"; 350 $keyword_id = $dbh->selectrow_array($query); 351 } 352 353 $query = "SELECT pgfile_id FROM pgfile_keyword WHERE keyword_id = \"$keyword_id\" and pgfile_id=\"$pgfile_id\""; 354 my $ok = $dbh->selectrow_array($query); 355 if(!defined($ok)){ 356 $dbh->do("INSERT INTO pgfile_keyword 357 VALUES( 358 \"$pgfile_id\", 359 \"$keyword_id\" 360 )" 361 ); 362 dbug "INSERT INTO pgfile_keyword VALUES( \"$pgfile_id\", \"$keyword_id\" )\n"; 363 } 364 } #end foreach keyword 365 366 # pgfile_institution table 367 # 368 $query = "SELECT institution_id FROM pgfile_institution WHERE pgfile_id = \"$pgfile_id\""; 369 my $pg_inst_id = $dbh->selectrow_array($query); 370 if(!defined($pg_inst_id)){ 371 $dbh->do("INSERT INTO pgfile_institution 372 VALUES( 373 \"$pgfile_id\", 374 \"$institution_id\" 375 )" 376 ); 377 dbug "INSERT INTO pgfile_institution VALUES( \"$pgfile_id\", \"$institution_id\" )\n"; 378 $pg_inst_id = $dbh->selectrow_array($query); 379 } 380 381 # textbook table 382 # 383 $query = "SELECT textbook_id FROM textbook WHERE title = \"$text\" AND edition = \"$edition\" AND author=\"$textauthor\""; 384 my $textbook_id = $dbh->selectrow_array($query); 385 if(!defined($textbook_id)){ 386 $dbh->do("INSERT INTO textbook 387 VALUES( 388 \"\", 389 \"$text\", 390 \"$edition\", 391 \"$textauthor\", 392 \"\", 393 \"\", 394 \"\" 395 )" 396 ); 397 dbug "INSERT INTO textbook VALUES( \"\", \"$text\", \"$edition\", \"$textauthor\", \"\", \"\", \"\" )\n"; 398 $textbook_id = $dbh->selectrow_array($query); 399 } 400 401 # chapter weak table of textbook 402 # 403 $query = "SELECT chapter_id FROM chapter WHERE textbook_id = \"$textbook_id\" AND name = \"$chapter\""; 404 my $chapter_id = $dbh->selectrow_array($query); 405 if(!defined($chapter_id)){ 406 $dbh->do("INSERT INTO chapter 407 VALUES( 408 \"\", 409 \"$textbook_id\", 410 \"".$chapnum."\", 411 \"$chapter\", 412 \"\" 413 )" 414 ); 415 dbug "INSERT INTO chapter VALUES(\"\", \"$textbook_id\", \"".$chapnum."\", \"$chapter\", \"\" )\n"; 416 $chapter_id = $dbh->selectrow_array($query); 417 } 418 419 # section weak table of textbook 420 # 421 $query = "SELECT section_id FROM section WHERE chapter_id = \"$chapter_id\" AND name = \"$section\""; 422 my $section_id = $dbh->selectrow_array($query); 423 if(!defined($section_id)){ 424 $dbh->do("INSERT INTO section 425 VALUES( 426 \"\", 427 \"$chapter_id\", 428 \"$secnum\", 429 \"$section\", 430 \"\" 431 )" 432 ); 433 dbug "INSERT INTO section VALUES(\"\", \"$textbook_id\", \"$secnum\", \"$section\", \"\" )\n"; 434 $section_id = $dbh->selectrow_array($query); 435 } 436 437 # problem table contains textbook problems 438 # 439 for my $tp (@textproblems) { 440 $query = "SELECT problem_id FROM problem WHERE section_id = \"$section_id\" AND number = \"$tp\""; 441 my $problem_id = $dbh->selectrow_array($query); 442 if(!defined($problem_id)){ 443 $dbh->do("INSERT INTO problem 444 VALUES( 445 \"\", 446 \"$section_id\", 447 \"$tp\", 448 \"\" 449 )" 450 ); 451 dbug "INSERT INTO problem VALUES( \"\", \"$section_id\", \"$tp\", \"\" )\n"; 452 $problem_id = $dbh->selectrow_array($query); 453 } 454 455 # pgfile_problem table associates pgfiles with textbook problems 456 # 457 $query = "SELECT problem_id FROM pgfile_problem WHERE problem_id = \"$problem_id\" AND pgfile_id = \"$pgfile_id\""; 458 my $pg_problem_id = $dbh->selectrow_array($query); 459 if(!defined($pg_problem_id)){ 460 $dbh->do("INSERT INTO pgfile_problem 461 VALUES( 462 \"$pgfile_id\", 463 \"$problem_id\" 464 )" 465 ); 466 dbug "INSERT INTO pgfile_problem VALUES( \"$pgfile_id\", \"$problem_id\" )\n"; 467 } 468 } 469 #reset tag vars, they may not match the next file 470 $chapter=""; $section=""; $date =""; $author=""; $institution=""; $text=""; $edition=""; $textauthor=""; $textsection=""; $textproblem=""; @textproblems=(); 471 } 472 close(IN) or die "can not close: $!"; 473 } 474 } 475 476 477 $dbh->disconnect; 478
| aubreyja at gmail dot com | ViewVC Help |
| Powered by ViewVC 1.0.9 |