[npl] / trunk / NationalProblemLibrary / loadDB2 Repository:
ViewVC logotype

View of /trunk/NationalProblemLibrary/loadDB2

Parent Directory Parent Directory | Revision Log Revision Log


Revision 439 - (download) (annotate)
Fri May 25 22:10:52 2007 UTC (5 years, 11 months ago) by jj
File size: 13805 byte(s)
Allow quotes in keywords.

    1 #!/usr/bin/perl
    2 
    3 #use strict;
    4 use File::Find;
    5 use File::Basename;
    6 use Cwd;
    7 use DBI;
    8 
    9 if(! -e "create_tables2.sql") {
   10 	print "Cannot find file create_tables2.sql.\n";
   11 	print "This needs to be run from the directory it was unpacked to,\n";
   12 	print "in which case \"create_tables2.sql\" should be in this directory.\n";
   13 	exit;
   14 }
   15 
   16 my $passwd;
   17 
   18 if(@ARGV != 1) {
   19 	print "What is the root password for mysql? ";
   20 	$passwd = <STDIN>;
   21 	chomp($passwd);
   22 } else {
   23 	$passwd = $ARGV[0];
   24 }
   25 
   26 my $user = 'root';
   27 my $libraryRoot = Cwd::cwd();
   28 my $verbose =0;
   29 my $cnt = 0;
   30 
   31 system("mysql -uroot -p$passwd -B ProblemLibrary <./create_tables2.sql");
   32 print "Mysql database reinitialized\n";
   33 
   34 my $mydb =  "dbi:mysql:ProblemLibrary";
   35 
   36 print "Converting data from tagged pgfiles into mysql.\n";
   37 # From pgfile
   38 ## DBchapter('Limits and Derivatives')
   39 ## DBsection('Calculating Limits using the Limit Laws')
   40 ## Date('6/3/2002')
   41 ## Author('Tangan Gao')
   42 ## Institution('csulb')
   43 ## TitleText1('Calculus Early Transcendentals')
   44 ## EditionText1('4')
   45 ## AuthorText1('Stewart')
   46 ## Section1('2.3')
   47 ## Problem1('7')
   48 #
   49 # The database structure is in the file create_tables2.sql and its ER-graph is 
   50 #    the file wwdb_er_graph.pdf. 
   51 
   52 my ($name,$chapnum,$secnum);
   53 my ($pgfile,$pgpath,@keyword,$tagged,$chapter,$section,$institution,$author,$text,$edition,$date,$textauthor,$textsection,$textproblem);
   54 
   55 #recursive search for all pg files
   56 my $dbh = DBI->connect('dbi:mysql:ProblemLibrary', $user, $passwd);
   57 
   58 $dbh->{RaiseError} = 1; #bail if anything fails via die
   59 
   60 find({ wanted => \&pgfiles, follow_fast=> 1}, $libraryRoot);
   61 
   62 sub kwtidy {
   63 	my $s = shift;
   64 	$s =~ s/\W//g;
   65 	$s =~ s/_//g;
   66 	$s = lc($s);
   67 	return($s);
   68 }
   69 
   70 sub keywordcleaner {
   71 	my $string = shift;
   72 	my @spl1 = split /,/, $string;
   73 	my @spl2 = map(kwtidy($_), @spl1);
   74 	return(@spl2);
   75 }
   76 
   77 sub dbug {
   78     my $msg = shift;
   79     $cnt++;
   80     print $msg if($verbose>1);
   81 #    if($cnt==5000) { print "\n"; $cnt=0;}
   82     print "." if($verbose==1 and ($cnt % 100 == 0));
   83 }
   84 
   85 # process each file returned by the find command.
   86 sub pgfiles {
   87     my $name = $File::Find::name;
   88     my ($subject, $chapter, $section, $date, $institution, $author, $text);
   89     my ($edition, $textauthor, $textsection, $textproblem);
   90     my @textproblems = (-1);
   91     if($name =~ /pg$/) {
   92 	$pgfile = basename($name);
   93 	$pgpath = dirname($name);
   94 	$pgpath =~ s|^$libraryRoot/||;
   95 	open(IN,"$name") or die "can not open $name: $!";
   96 	$tagged = 0;
   97 	while(<IN>) {
   98 	  SWITCH: {
   99 	      if(/KEYWORDS\((.*)\)/i) {
  100 			@keyword = keywordcleaner($1);
  101 		      last SWITCH;
  102 	      }
  103 	      if(/DBsubject\(\s*'(.*?)'\s*\)/) {
  104 		  $subject = $1;
  105 		  $subject =~ s/'/\'/g;
  106 		  last SWITCH;
  107 	      }
  108 	      if(/DBchapter\(\s*'(.*?)'\s*\)/) {
  109 		  $chapter = $1;
  110 		  $chapter =~ s/'/\'/g;
  111 		  $tagged = 1;
  112 		  last SWITCH;
  113 	      }
  114 	      if(/DBsection\(\s*'(.*?)'\s*\)/) {
  115 		  $section = $1;
  116 		  $section =~ s/'/\'/g;
  117 		  last SWITCH;
  118 	      }
  119 	      if(/Date\(\s*'(.*?)'\s*\)/) {
  120 		  $date = $1;
  121 		  $date =~ s/'/\'/g;
  122 		  last SWITCH;
  123 	      }
  124 	      if(/Institution\(\s*'(.*?)'\s*\)/) {
  125 		  $institution = $1;
  126 		  $institution =~ s/'/\'/g;
  127 		  last SWITCH;
  128 	      }
  129 	      if(/Author\(\s*'(.*?)'\s*\)/) {
  130 		  $author = $1;
  131 		  $author =~ s/'/\'/g;
  132 		  last SWITCH;
  133 	      }
  134 	      if(/TitleText1\(\s*'(.*?)'\s*\)/) {
  135 		  $text = $1;
  136 		  $text =~ s/'/\'/g;
  137 		  last SWITCH;
  138 	      }
  139 	      if(/EditionText1\(\s*'(.*?)'\s*\)/)
  140 	      {
  141 		  $edition = $1;
  142 		  $edition =~ s/'/\'/g;
  143 		  last SWITCH;
  144 	      }
  145 	      if(/AuthorText1\(\s*'(.*?)'\s*\)/)
  146 	      {
  147 		  $textauthor = $1;
  148 		  $textauthor =~ s/'/\'/g;
  149 		  last SWITCH;
  150 	      }
  151 	      if(/Section1\(\s*'(.*?)'\s*\)/)
  152 	      {
  153 		  $textsection = $1;
  154 		  $textsection =~ s/'/\'/g;
  155 		  last SWITCH;
  156 	      }
  157 	      if(/Problem1\(\s*'(.*?)'\s*\)/)
  158 	      {
  159 		$textproblem = $1;
  160 		$textproblem =~ s/\D/ /g;
  161 		@textproblems = split /\s+/, $textproblem;
  162 		@textproblems = grep { $_ =~ /\S/ } $textproblem;
  163 		@textproblems = (-1) unless(scalar(@textproblems));
  164 #print "$textproblem\n" if ($textproblem !~ /^[\s\d]+$/);
  165 #		$textproblem =~ s/'/\'/g;
  166 		last SWITCH;
  167 	      }
  168 	    }}#end of SWITCH and while
  169 	if($tagged) {
  170 	    #
  171 	    # kludge to fix the omission of a subject field
  172 	    unless($subject) {
  173 		if($text =~ /precalculus/i) { $subject = "Precalculus" }
  174 		elsif($text =~ /calculus/i) { $subject = "Calculus" }
  175 		elsif($text =~ /linear/i) { $subject = "Linear Algebra" }
  176 		elsif($text =~ /algebra/i) { $subject = "Algebra" }
  177 		elsif($text =~ /statistic/i) { $subject = "Statistics" }
  178 		elsif($text =~ /financial/i) { $subject = "Financial Mathematics" }
  179 		else { $subject = "Misc" }
  180 	    }
  181 	    # From the pgfile we just looked at,
  182 	    ## DBchapter('Limits and Derivatives') 		in $chapter
  183 	    ## DBsection('Calculating Limits using the Limit Laws') in $section
  184 	    ## Date('6/3/2002') 				in $date
  185 	    ## Author('Tangan Gao') 				in $author
  186 	    ## Institution('csulb') 				in $institution
  187 	    ## TitleText1('Calculus Early Transcendentals')	in $text
  188 	    ## EditionText1('4') 				in $edition
  189 	    ## AuthorText1('Stewart') 				in $textauthor
  190 	    ## Section1('2.3') 					in $textsection
  191 	    ## Problem1('7') 					in $textproblem
  192 	    #
  193 	    # The database structure is in the file create_tables2.sql and its ER-graph
  194 	    #    is the file wwdb_er_graph.pdf. Insert, in order, into the tables
  195 	    #    in that file. 
  196 	    #
  197 	    if($textsection =~ /(\d*?)\.(\d*)/) {
  198 			$chapnum = $1;
  199 			$secnum = $2;
  200 	    } else {
  201 			$chapnum = undef;  #no valid number assigned
  202 			$secnum = $textsection;
  203 	    };
  204 
  205 	    #selectrow_array returns first field of first row in scalar context or undef 
  206 	    # undef for failure also, $dbh->{RaiseError} = 1 should catch that case.
  207 	    #
  208 	    # DBsubject table
  209 	    #
  210 	    $query = "SELECT DBsubject_id FROM DBsubject WHERE name = \"$subject\"";
  211 	    my $DBsubject_id = $dbh->selectrow_array($query);  
  212 	    if(!defined($DBsubject_id)){
  213 		$dbh->do(
  214 		    "INSERT INTO DBsubject 
  215 					VALUES(
  216 						\"\",
  217 						\"$subject\"
  218 					)"
  219 		    );
  220 		dbug "INSERT INTO DBsubject VALUES(\"\",\"$subject\")\n";
  221 		$DBsubject_id = $dbh->selectrow_array($query);
  222 	    }
  223 	    
  224 	    # DBchapter table
  225 	    #
  226 	    $query = "SELECT DBchapter_id FROM DBchapter WHERE name = \"$chapter\" and DBsubject_id = $DBsubject_id";
  227 	    my $DBchapter_id = $dbh->selectrow_array($query);  
  228 	    if(!defined($DBchapter_id)){
  229 		$dbh->do("INSERT INTO DBchapter 
  230 					VALUES(
  231 						\"\",
  232 						\"$chapter\",
  233 						\"$DBsubject_id\"
  234 					)"
  235 		    );
  236 		dbug "INSERT INTO DBchapter VALUES(  \"\", \"$chapter\", \"$DBsubject_id\)\n";
  237 		$DBchapter_id = $dbh->selectrow_array($query);
  238 	    }
  239 	    
  240 	    # DBsection table
  241 	    #
  242 	    $query = "SELECT DBsection_id FROM DBsection WHERE name = \"$section\" AND DBchapter_id = $DBchapter_id";
  243 	    my $DBsection_id = $dbh->selectrow_array($query);  
  244 	    if(!defined($DBsection_id)){
  245 		$dbh->do("INSERT INTO DBsection 
  246 					VALUES(
  247 						\"\",
  248 						\"$section\",
  249 						\"$DBchapter_id\"
  250 					)"
  251 		    );
  252 		dbug "INSERT INTO DBsection VALUES( \"\", \"$section\", \"$DBchapter_id\", \"$DBsubject_id\" )\n";
  253 		$DBsection_id = $dbh->selectrow_array($query);
  254 	    }
  255 	    
  256 	    # institution table
  257 	    #
  258 	    $query = "SELECT institution_id FROM institution WHERE name = \"$institution\"";
  259 	    my $institution_id = $dbh->selectrow_array($query);  
  260 	    if(!defined($institution_id)){
  261 		$dbh->do("INSERT INTO institution 
  262 					VALUES(
  263 						\"\",
  264 						\"$institution\",
  265 						\"\",
  266 						\"\",
  267 						\"\",
  268 						\"\",
  269 						\"\",
  270 						\"\"
  271 					)"
  272 		    );
  273 		dbug "INSERT INTO institution VALUES( \"\", \"$institution\", \"\", \"\", \"\", \"\", \"\", \"\")\n";
  274 		$institution_id = $dbh->selectrow_array($query);
  275 	    }
  276 	    
  277 	    # author table
  278 	    #
  279 	    $author =~ /(.*?)\s(\w+)\s*$/;
  280 	    my $firstname = $1;
  281 	    my $lastname = $2;
  282 	    #remove leading and trailing spaces from firstname, which includes any middle name too.
  283 	    $firstname =~ s/^\s*//;
  284 	    $firstname =~ s/\s*$//;
  285 	    $query = "SELECT author_id FROM author WHERE lastname = \"$lastname\" AND firstname=\"$firstname\"";
  286 	    my $author_id = $dbh->selectrow_array($query);  
  287 	    if(!defined($author_id)){
  288 		$dbh->do("INSERT INTO author 
  289 					VALUES(
  290 						\"\",
  291 						\"$institution_id\",
  292 						\"$lastname\",
  293 						\"$firstname\",
  294 						\"\"
  295 					)"
  296 		    );
  297 		dbug "INSERT INTO author VALUES( \"\", \"$institution_id\", \"$lastname\", \"$firstname\", \"\" )\n";
  298 		$author_id = $dbh->selectrow_array($query);
  299 	    }
  300 	    
  301 	    # path table
  302 	    #
  303 	    $query = "SELECT path_id FROM path WHERE path = \"$pgpath\"";
  304 	    my $path_id = $dbh->selectrow_array($query);  
  305 	    if(!defined($path_id)){
  306 		$dbh->do("INSERT INTO path 
  307 					VALUES(
  308 						\"\",
  309 						\"$pgpath\",
  310 						\"\",
  311 						\"\"
  312 					)"
  313 		    );
  314 		dbug "INSERT INTO path VALUES( \"\", \"$path\", \"\", \"\" )\n";
  315 		$path_id = $dbh->selectrow_array($query);
  316 	    }
  317 	    
  318 	    # pgfile table
  319 	    #
  320 	    my $pgfile_id;
  321 	    $dbh->do("INSERT INTO pgfile 
  322 					VALUES(
  323 						\"\",
  324 						\"$DBsection_id\",
  325 						\"$author_id\",
  326 						\"$institution_id\",
  327 						\"$path_id\",
  328 						\"$pgfile\",
  329 						\"\"
  330 					)"
  331 		);
  332 	    dbug "INSERT INTO pgfile VALUES( \"\", \"$DBsection_id\", \"$author_id\", \"$institution_id\", \"$path_id\", \"$pgfile\", \"\" )\n";
  333 	    $query = "SELECT pgfile_id FROM pgfile WHERE filename = \"$pgfile\"";
  334 	    $pgfile_id = $dbh->selectrow_array($query);
  335 	    # keyword table, and problem_keyword many-many table
  336 	    #
  337 	    foreach my $keyword (@keyword)
  338 	    {
  339 		$keyword =~ s/[\'\"]//g;
  340 		$query = "SELECT keyword_id FROM keyword WHERE keyword = \"$keyword\"";
  341 		my $keyword_id = $dbh->selectrow_array($query);  
  342 		if(!defined($keyword_id)){
  343 		    $dbh->do("INSERT INTO keyword 
  344 						VALUES(
  345 							\"\",
  346 							\"$keyword\"
  347 						)"
  348 			);
  349 		    dbug "INSERT INTO keyword VALUES( \"\", \"$keyword\")\n";
  350 		    $keyword_id = $dbh->selectrow_array($query);
  351 		}
  352 		
  353 		$query = "SELECT pgfile_id FROM pgfile_keyword WHERE keyword_id = \"$keyword_id\" and pgfile_id=\"$pgfile_id\"";
  354 		my $ok = $dbh->selectrow_array($query);  
  355 		if(!defined($ok)){
  356 		    $dbh->do("INSERT INTO pgfile_keyword 
  357 						VALUES(
  358 							\"$pgfile_id\",
  359 							\"$keyword_id\"
  360 						)"
  361 			);
  362 		    dbug "INSERT INTO pgfile_keyword VALUES( \"$pgfile_id\", \"$keyword_id\" )\n";
  363 		}
  364 	    } #end foreach keyword
  365 	    
  366 	    # pgfile_institution table
  367 	    #
  368 	    $query = "SELECT institution_id FROM pgfile_institution WHERE pgfile_id = \"$pgfile_id\"";
  369 	    my $pg_inst_id = $dbh->selectrow_array($query);  
  370 	    if(!defined($pg_inst_id)){
  371 		$dbh->do("INSERT INTO pgfile_institution 
  372 					VALUES(
  373 						\"$pgfile_id\",
  374 						\"$institution_id\"
  375 					)"
  376 		    );
  377 		dbug "INSERT INTO pgfile_institution VALUES( \"$pgfile_id\", \"$institution_id\" )\n";
  378 		$pg_inst_id = $dbh->selectrow_array($query);
  379 	    }
  380 	    
  381 	    # textbook table
  382 	    #
  383 	    $query = "SELECT textbook_id FROM textbook WHERE title = \"$text\" AND edition = \"$edition\" AND author=\"$textauthor\"";
  384 	    my $textbook_id = $dbh->selectrow_array($query);  
  385 	    if(!defined($textbook_id)){
  386 		$dbh->do("INSERT INTO textbook 
  387 					VALUES(
  388 						\"\",
  389 						\"$text\",
  390 						\"$edition\",
  391 						\"$textauthor\",
  392 						\"\",
  393 						\"\",
  394 						\"\"
  395 					)"
  396 		    );
  397 		dbug "INSERT INTO textbook VALUES( \"\", \"$text\", \"$edition\", \"$textauthor\", \"\", \"\", \"\" )\n";
  398 		$textbook_id = $dbh->selectrow_array($query);
  399 	    }
  400 
  401 	    # chapter weak table of textbook
  402 	    #
  403 	    $query = "SELECT chapter_id FROM chapter WHERE textbook_id = \"$textbook_id\" AND name = \"$chapter\"";
  404 	    my $chapter_id = $dbh->selectrow_array($query);  
  405 	    if(!defined($chapter_id)){
  406 		$dbh->do("INSERT INTO chapter 
  407 					VALUES(
  408 						\"\",
  409 						\"$textbook_id\",
  410 						\"".$chapnum."\",
  411 						\"$chapter\",
  412 						\"\"
  413 					)"
  414 		    );
  415 		dbug "INSERT INTO chapter VALUES(\"\", \"$textbook_id\", \"".$chapnum."\", \"$chapter\", \"\" )\n";
  416 	        $chapter_id = $dbh->selectrow_array($query);  
  417 	    }
  418 
  419 	    # section weak table of textbook
  420 	    #
  421 	    $query = "SELECT section_id FROM section WHERE chapter_id = \"$chapter_id\" AND name = \"$section\"";
  422 	    my $section_id = $dbh->selectrow_array($query);  
  423 	    if(!defined($section_id)){
  424 		$dbh->do("INSERT INTO section 
  425 					VALUES(
  426 						\"\",
  427 						\"$chapter_id\",
  428 						\"$secnum\",
  429 						\"$section\",
  430 						\"\"
  431 					)"
  432 		    );
  433 		dbug "INSERT INTO section VALUES(\"\", \"$textbook_id\", \"$secnum\", \"$section\", \"\" )\n";
  434 	    	$section_id = $dbh->selectrow_array($query);  
  435 	    }
  436 
  437 	    # problem table contains textbook problems
  438 	    #
  439 	for my $tp (@textproblems) {
  440 	    $query = "SELECT problem_id FROM problem WHERE section_id = \"$section_id\" AND number = \"$tp\"";
  441 	    my $problem_id = $dbh->selectrow_array($query);
  442 	    if(!defined($problem_id)){
  443 		$dbh->do("INSERT INTO problem 
  444 					VALUES(
  445 						\"\",
  446 						\"$section_id\",
  447 						\"$tp\",
  448 						\"\"
  449 					)"
  450 		    );
  451 		dbug "INSERT INTO problem VALUES( \"\", \"$section_id\", \"$tp\", \"\" )\n";
  452 		$problem_id = $dbh->selectrow_array($query);
  453 	    }
  454 
  455 	    # pgfile_problem table associates pgfiles with textbook problems
  456 	    #
  457 	    $query = "SELECT problem_id FROM pgfile_problem WHERE problem_id = \"$problem_id\" AND pgfile_id = \"$pgfile_id\"";
  458 	    my $pg_problem_id = $dbh->selectrow_array($query);  
  459 	    if(!defined($pg_problem_id)){
  460 		$dbh->do("INSERT INTO pgfile_problem 
  461 					VALUES(
  462 						\"$pgfile_id\",
  463 						\"$problem_id\"
  464 					)"
  465 		    );
  466 	        dbug "INSERT INTO pgfile_problem VALUES( \"$pgfile_id\", \"$problem_id\" )\n";
  467 	    }
  468 	   } 
  469 	    #reset tag vars, they may not match the next file
  470 	    $chapter=""; $section=""; $date =""; $author=""; $institution=""; $text=""; $edition=""; $textauthor=""; $textsection=""; $textproblem=""; @textproblems=();
  471 	}
  472 	close(IN) or die "can not close: $!";
  473     }
  474 }
  475 
  476 
  477 $dbh->disconnect;
  478 

aubreyja at gmail dot com
ViewVC Help
Powered by ViewVC 1.0.9