#!/usr/bin/env perl

# This script will grab information from the GlueX DocDB website in order to
# produce a Bibtex compatible references file (gluex_docs.bib) that contains
# references for every document currently in the DocDB. This automates a process
# that has been done by hand up to now.
#
# Several options exist below which must be set by modifying this script itself
# before running it. Because it downloads over 2000 webpages from the DocDB, there
# are options that allow you to save the downloaded pages to local files so they
# can be used on subsequent invocations. There is no option to update the cache
# with just the differences. You either download them all from the web, or read
# them all from the cache. This is still useful though in case you want to
# tweak the reference before writing it to the output file.
#
# Author names and institution names can be modified globally from how they are
# stored in the DocDB. Also, author lists and document titles can be replaced
# outright using the hashes below. Please see the comments for details.
#
# For questions regarding the GlueX DocDB, please contact:
#   Zisis Papandreou <zisis@uregina.ca>
#
# For questions regarding this script, please contact:
#   David Lawrence

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Beginning of configurable section
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

# The following two flags can be used to write to or read from a local
# cache of data files. By setting the value of use_local_cache to
# a non-zero value, the local cache directory will be searched for each
# file. If it is not found, the docid will be effectively ignored.
# A non-zero value for save_to_cache will cause the files read from the
# web to be written to a local cache file, but only if use_local_cache
# is zero.
our $use_local_cache = 0;
our $save_to_cache   = 1;

# Setting this to non-zero will cause documents whose document type contains
# the string "Talk" to be ignored
our $filter_talks = 0;

# Turn on/off verbose output
our $verbose = 0;

# Use the author_replace and institution_replace hashes to globally
# modify an author or institution from what is displayed on the DocDB.
# The string used on the left hand side should be exactly as it appears
# on the webpages. The string on the right is what it should be replaced
# with. Note that the first name is always replaced with the first initial
# AFTER any substitutions defined here are made.
our %author_replace;
our %institution_replace;
$author_replace{"Elton Smith"} = "Elton S. Smith";
$institution_replace{"Thomas Jefferson National Accelerator Facility"} = "Jefferson Lab";

# Use author_list_replace and title_replace to set the author list or title
# of specific docids, overriding what was found on the DocDB website. This can be
# for example, to replace an author list with a primary author followed by et al.
# It can also be used to correct titles that are just mis-typed or poorly formed
# in the DocDB. Note that no additional filtering is done on these values so they
# should be well-formed Latex, but with '$' and '\' characters escaped using a
# preceding '\'.
our %author_list_replace;
our %title_replace;
$author_list_replace{19}   = "The Hall D Collaboration, R. Clark \\emph{et al.}";
$author_list_replace{44}   = "The Hall D Collaboration, R. Clark \\emph{et al.}";
$author_list_replace{58}   = "The GlueX Collaboration";
$author_list_replace{842}  = "A. Dzierba, Z. Papandreou \\emph{et al.}";
$author_list_replace{1317} = "B. Giebrecht \\emph{et al.}";
$author_list_replace{1701} = "B.D. Leverington \\emph{et al.}";
$author_list_replace{1864} = "Z. Papandreou \\emph{et al.}";
$title_replace{1702} = "Statistics for \$\\pi^{o}\$ and \$\\eta\$ production";
$title_replace{2104} = "Identification of New 5-pion Meson States in the GlueX Experiment using Amplitude Analysis";
# Every '$' has to be escaped here: an unescaped "$\" inside a double-quoted
# string is Perl's output-record-separator variable, not a literal "$\".
$title_replace{2169} = "Low-Energy Tests of SM and Beyond via Rare \$\\eta\$ Decays";

# NOTE: There are some other Unicode characters that appear like "&\#64259"
# instead of "ffi" or more commonly "&\#960" for greek letter pi that need
# to be replaced in the titles to make them Latex friendly. These are handled
# below in the subroutine "MakeLatexFriendly". If you encounter one of these
# not already handled, then you'll need to add a line to that subroutine.

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# End of configurable section
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

# Make user give us the password for the DocDB writer account so we don't have
# to store it in this file.
our $password;
if ($use_local_cache == 0) {
    print "\n";
    print "In order to access the DocDB, you must enter the password for the.\n";
    print "writer account. We make you do this here so that it is not hardcoded\n";
    print "in this script which is publically accessible. This is the standard\n";
    print "password used for the GlueX DocDB.\n";
    print "\n";
    print "password: ";

    # Read from the terminal explicitly (a bare <> would read from any files
    # named on the command line) and drop the trailing newline so it does
    # not become part of the password.
    $password = <STDIN>;
    $password = "" unless defined $password;
    chomp($password);
    print "\n";
}

# Package globals shared with the subroutines: GetAllDocids fills @docids and
# AddCrossReferences sets $Ncrossreferences.
our @docids;
our $Ncrossreferences;

# Strictures are lexically scoped to this block so they do not leak into
# code outside of it.
{
    use strict;
    use warnings;

    # Get all document ids
    GetAllDocids();

    # temporarily truncate list of docids (for debugging only)
    #splice @docids, 1;
    #@docids = (123,143,284,302);

    # Inform user how many document ids we found.
    print "Found ".scalar(@docids)." documents in DocDB. Retrieving and parsing ....\n";

    # Sort docid values numerically
    @docids = sort { $a <=> $b } @docids;

    # Open output file "gluex_docs.bib" and write header into it
    my $bib_name = "gluex_docs.bib";
    open(my $bib, '>', $bib_name)
        or die "Cannot open $bib_name for writing: $!\n";

    # The login name lives in USER (environment names are case sensitive);
    # the lower-case spelling is kept as a last resort.
    my ($user) = grep { defined } @ENV{qw(USER LOGNAME user)};
    $user = "unknown" unless defined $user;

    print {$bib} "%% This file generated by the DocDB2Bibtex script\n";
    print {$bib} "%% by David Lawrence \n";
    print {$bib} "%% ".$user." ".scalar(localtime)."\n";
    print {$bib} "\n\n";

    # Loop over docids and get the relevant info for each
    my $Nwritten = 0;
    foreach my $docid (@docids) {
        print "Getting doc info for $docid ...\n" if $verbose;

        # GetDocInfo returns (authors, institutions, title, month, year);
        # missing pieces are normalised to empty strings.
        my @info = GetDocInfo($docid);
        my ($authors, $institutions, $title, $month, $year) =
            map { defined $info[$_] ? $info[$_] : "" } 0 .. 4;

        # Replace author list or title if specified. We wrap the author list in
        # an additional set of curly brackets {} so that LaTeX will maintain
        # the order (otherwise, it would sort them alphabetically).
        if (exists $author_list_replace{$docid}) {
            $authors = "{".$author_list_replace{$docid}."}";
        }
        if (exists $title_replace{$docid}) {
            $title = $title_replace{$docid};
        }

        # Only write out this reference if the authors list is not empty
        next if length($authors) == 0;

        # An empty unbraced field ("Month = ,") is a BibTeX syntax error, so
        # fall back to an empty braced value when no date was found.
        my $month_field = length($month) ? $month : "{}";
        my $year_field  = length($year)  ? $year  : "{}";

        my $str = "\@techreport{hdnote$docid,\n";
        $str .= " Author = {$authors},\n";
        $str .= " Institution = {$institutions},\n";
        $str .= " Month = $month_field,\n";
        $str .= " Note = {\\url{http://argus.phys.uregina.ca/cgi-bin/private/DocDB/ShowDocument?docid=$docid}},\n";
        $str .= " Number = {GlueX-doc-\\textbf{$docid}},\n";
        $str .= " Title = {{$title}},\n";
        $str .= " Year = $year_field}\n";

        print "$str\n" if $verbose;
        print {$bib} "$str\n";
        $Nwritten++;
    }
    print "Generated $Nwritten references from DocDB data.\n";

    # Add additional cross references
    print {$bib} AddCrossReferences();
    $Nwritten += $Ncrossreferences;

    # Close gluex_docs.bib (buffered write errors surface here)
    close($bib) or die "Error while writing $bib_name: $!\n";
    print "Wrote $Nwritten references to gluex_docs.bib\n";
}

# Create a TeX file that cites every single docid so it can be tested.
# Strictures are lexically scoped to this block so they do not leak into
# code outside of it.
{
    use strict;
    use warnings;

    our @docids;    # package global holding the list of document ids

    # One \cite per docid, wrapped in a minimal article.
    my $tex_name = "gluex_doc_cite_test.tex";
    my $tex = join("",
        "\\documentclass[11pt]{article}\n",
        "\\usepackage{geometry}\n",
        "\\geometry{letterpaper}\n",
        "\\usepackage{amssymb}\n",
        "\\usepackage{url}\n",
        "\n",
        "\\title{GlueX DocDB References Test}\n",
        "\\author{GlueX Collaboration}\n",
        "\\begin{document}\n",
        "\\maketitle\n",
        (map { "\\cite{hdnote$_}\n" } @docids),
        "\n",
        "\\bibliographystyle{unsrt}\n",
        "\n\n\\bibliography{gluex_docs}{}\n\n",
        "\\end{document}\n\n",
    );
    open(my $tex_fh, '>', $tex_name)
        or die "Cannot open $tex_name for writing: $!\n";
    print {$tex_fh} $tex;
    close($tex_fh) or die "Error while writing $tex_name: $!\n";

    # Create Makefile to generate test PDF. Recipe lines MUST begin with a
    # TAB character, hence the explicit "\t" escapes.
    my $make_name = "Makefile";
    my $makefile = join("",
        "\n",
        "all: gluex_doc_cite_test.tex gluex_docs.bib\n",
        "\tpdflatex gluex_doc_cite_test.tex\n",
        "\tbibtex gluex_doc_cite_test\n",
        "\tpdflatex gluex_doc_cite_test.tex\n",
        "\tpdflatex gluex_doc_cite_test.tex\n",
        "\n",
    );
    open(my $make_fh, '>', $make_name)
        or die "Cannot open $make_name for writing: $!\n";
    print {$make_fh} $makefile;
    close($make_fh) or die "Error while writing $make_name: $!\n";

    # Final message to user
    print "\n";
    print "A test file containing citations to all references has been generated\n";
    print "so the final gluex_docs.bib file can be tested. To build it, type \"make\"\n";
    print "or do the following:\n";
    print "\n";
    print "pdflatex gluex_doc_cite_test.tex\n";
    print "bibtex gluex_doc_cite_test\n";
    print "pdflatex gluex_doc_cite_test.tex\n";
    print "pdflatex gluex_doc_cite_test.tex\n";
    print "\n";
    print "The references have been written to \"gluex_docs.bib\"\n";
}

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Subroutines
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

#
# These globally change the name of an author and institution
# from how it is displayed on the DocDB. The author will have
# the first name replaced with just an initial too.
#----------------------------
# AuthorFilter
#----------------------------
# Apply the global %author_replace substitution to one author name and then
# reduce the first name to an initial.
#   in:  author name as a string
#   out: filtered author name
sub AuthorFilter
{
    use strict; use warnings;
    our %author_replace;

    my ($author) = @_;
    $author = "" unless defined $author;

    if (exists $author_replace{$author}) {
        $author = $author_replace{$author};
    }

    # If second letter is not a space or a ".", then chop off rest of
    # first name and make just an initial.
    return $author if length($author) < 2;
    my $second_char = substr($author, 1, 1);
    if ($second_char ne ' ' and $second_char ne '.') {
        # Only abbreviate when there really is whitespace to split on; a
        # single-word name is returned unchanged. $+[0] is the offset just
        # past the matched whitespace character.
        if ($author =~ /\s/) {
            $author = substr($author, 0, 1).". ".substr($author, $+[0]);
        }
    }

    return $author;
}

#----------------------------
# InstitutionFilter
#----------------------------
# Apply the global %institution_replace substitution to one institution name.
#   in:  institution name as a string
#   out: replacement if one is configured, otherwise the input unchanged
sub InstitutionFilter
{
    use strict; use warnings;
    our %institution_replace;

    my ($institution) = @_;
    $institution = "" unless defined $institution;

    if (exists $institution_replace{$institution}) {
        $institution = $institution_replace{$institution};
    }

    return $institution;
}

#----------------------------
# _FetchUrl
#----------------------------
# Private helper: download one URL as the DocDB "writer" user with curl and
# return the response body ("" if nothing was received).
sub _FetchUrl
{
    use strict; use warnings;
    our $password;

    my ($url) = @_;

    # Tolerate a password that still carries the newline typed at the prompt.
    my $pw = defined $password ? $password : "";
    $pw =~ s/[\r\n]+\z//;

    # List-form pipe open: the arguments go straight to curl without passing
    # through a shell, so special characters in the password are harmless.
    open(my $curl, '-|', 'curl', '-u', "writer:$pw", $url)
        or die "Cannot run curl: $!\n";
    my $content = do { local $/; <$curl> };
    close($curl) or warn "curl reported a failure (status $?) for $url\n";

    return defined $content ? $content : "";
}

#----------------------------
# _SaveToCache
#----------------------------
# Private helper: write $content to $path, creating the cache directory
# SAVED_DOCINFO_FILES first if needed. Failures only produce a warning.
sub _SaveToCache
{
    use strict; use warnings;

    my ($path, $content) = @_;
    my $dir = "SAVED_DOCINFO_FILES";

    if (!-d $dir and !mkdir($dir)) {
        warn "Cannot create directory $dir: $!\n";
        return;
    }
    my $out;
    if (!open($out, '>', $path)) {
        warn "Cannot open $path for writing: $!\n";
        return;
    }
    print {$out} $content;
    close($out) or warn "Error while writing $path: $!\n";
    return;
}

#
# Get all document ids from the DocDB
#
#----------------------------
# GetAllDocids
#----------------------------
# Fills the package global @docids from FullList.html, which is either
# downloaded from the server or read from the local cache depending on
# $use_local_cache. The downloaded list is cached when $save_to_cache is set.
sub GetAllDocids
{
    use strict; use warnings;
    our $use_local_cache;
    our $save_to_cache;
    our @docids = ();

    my $cache_file = "SAVED_DOCINFO_FILES/FullList.html";
    my @lines;

    # Read list of docids
    if ($use_local_cache == 0) {
        # Get list from web
        print "Retrieving full document list from server .... \n";

        # Get the file FullList.html from the server.
        my $fulllist = _FetchUrl("http://argus.phys.uregina.ca/gluex/DocDB//Static/Lists/writer/FullList.html");
        @lines = split(/\n/, $fulllist);

        _SaveToCache($cache_file, $fulllist) if $save_to_cache != 0;
    }
    else {
        # Get list from cached file
        print "Retrieving full document list from cached file .... \n";
        my $in;
        if (!open($in, '<', $cache_file)) {
            warn "Cannot open $cache_file: $!\n";
            return;
        }
        @lines = <$in>;
        close($in);
    }

    # Loop over all lines in the FullList.html file and look for ones
    # describing a document. All we need at the moment is the docid.
    #
    # NOTE(review): the original restricted this to lines starting with a
    # particular HTML tag, but that pattern did not survive in the source
    # text. Instead, the first "docid=" link of every line is taken (any
    # "&version=..." suffix is ignored) and duplicates are dropped. Please
    # confirm against a real FullList.html.
    my %seen;
    foreach my $line (@lines) {
        next unless $line =~ /docid=(\d+)/;
        my $docid = $1;
        push @docids, $docid unless $seen{$docid}++;
    }

    return;
}

#
# Get info for a single, specific document from DocDB
#
#----------------------------
# GetDocInfo
#----------------------------
# in:  docid
# out: list (authors, institutions, title, month, year). Authors and
#      institutions are joined with " and " in the order they appear on the
#      page. Five empty strings are returned when the cached page cannot be
#      opened or the document type is filtered out.
sub GetDocInfo
{
    use strict; use warnings;
    our $use_local_cache;
    our $save_to_cache;
    our $filter_talks;
    our $verbose;

    my ($docid) = @_;
    my @nothing = ("", "", "", "", "");
    my $fname = "SAVED_DOCINFO_FILES/ShowDocument".$docid.".html";

    # Either grab a locally cached html file or grab it off the web
    my @lines;
    if ($use_local_cache != 0) {
        # Try opening cached file; a missing file means the docid is ignored.
        print "Trying $fname ...\n" if $verbose;
        open(my $in, '<', $fname) or return @nothing;
        @lines = <$in>;
        close($in);
    }
    else {
        # Grab the document page from the DocDB server
        my $html = _FetchUrl("http://argus.phys.uregina.ca/cgi-bin/private/DocDB/ShowDocument?docid=$docid");

        # If authentication failed, detect it now and exit the program
        if ($html =~ /401 Authorization Required/) {
            print "\nAuthorization has failed. Maybe you mis-typed the password??\n";
            print "Try re-running. If you still have problems, contact is Zisis\n";
            print "Papandreou (zisis\@uregina.ca) to make sure you have the correct\n";
            print "password.\n";
            exit(-1);
        }

        # Optionally save to local file
        if ($save_to_cache != 0) {
            print "Writing file: $fname\n" if $verbose;
            _SaveToCache($fname, $html);
        }

        @lines = split(/\n/, $html);
    }

    # Everything parsed from the page is lexical so that nothing leaks over
    # from a previously processed document.
    my ($title, $type, $month, $year) = ("", "", "", "");
    my $next_line_is_date = 0;

    # Names in first-seen order; the hashes are only used to drop duplicates.
    my (@authors, %seen_author);
    my (@institutions, %seen_institution);

    foreach my $line (@lines) {

        # Get title: the text between the first ": " and the closing tag
        if ($line =~ m{<title>}) {
            if ($line =~ m{: (.*?)</title>}) {
                $title = $1;
            }
        }

        # Get Document type
        if ($line =~ /Document type\:/) {
            if ($line =~ m{">(.*?)</a>}) {
                $type = $1;
            }

            # Filter document types here
            if ($filter_talks != 0 and $type =~ /Talk/) {
                print "Ignoring Document Type: $type\n" if $verbose;
                return @nothing;
            }
        }

        # Get revision date from the line following "Contents Revised:".
        # NOTE(review): the tag tested here was lost from the source text;
        # "<dd>" is a reconstruction and should be confirmed.
        if ($next_line_is_date and $line =~ m{<dd>}) {
            my @tokens = split(/\s/, $line);
            if (@tokens > 2) {
                $month = $tokens[1];
                $year  = $tokens[2];
                $year =~ s/\D+\z//;    # drop the trailing "," after the year
            }
        }
        $next_line_is_date = ($line =~ /Contents Revised:/) ? 1 : 0;

        # Get Authors
        # NOTE(review): the tag tested here was lost from the source text;
        # "<li>" is a reconstruction and should be confirmed against a real
        # ShowDocument page.
        if ($line =~ m{<li>} and $line =~ /authorid=/) {
            foreach my $block (split(/authorid=/, $line)) {
                # Each usable block looks like
                #   ID" title="INSTITUTION">AUTHOR</a>...
                # The text in front of the first "authorid=" does not have
                # this shape and is skipped by the failed match.
                next unless $block =~ m{\A[^"]*".*?title="(.*?)">(.*?)</a>}s;
                my ($institution, $author) = ($1, $2);
                chomp($author);
                chomp($institution);
                next unless length($author);

                push @institutions, $institution unless $seen_institution{$institution}++;
                push @authors,      $author      unless $seen_author{$author}++;
            }
        }
    }

    # Make Latex friendly title
    $title = MakeLatexFriendly($title);

    # Make Latex friendly author string
    my $all_authors = join(" and ", map { AuthorFilter($_) } @authors);

    # Make Latex friendly institution string
    my $all_institutions = join(" and ", map { InstitutionFilter($_) } @institutions);

    return ($all_authors, $all_institutions, $title, $month, $year);
}

#----------------------------
# MakeLatexFriendly
#----------------------------
# in:  document title as found on the DocDB
# out: the title with HTML entities and plain-text notation turned into Latex
sub MakeLatexFriendly
{
    use strict; use warnings;

    # This routine is used to modify a title to be Latex friendly
    # so it won't cause failures when typesetting. The current GlueX
    # DocDB has all kinds of mixtures of Latex, HTML, and ASCII
    # encodings in the titles so numerous filters are applied here
    # to try and get them all to work. Some explanations are lengthy
    # just due to the cryptic nature of perl regexes which can make
    # understanding some of them very difficult!
    my $title = defined $_[0] ? $_[0] : "";

    # Replace some HTML symbols with Latex
    $title =~ s/\&\#960\;/\$\\pi\$/g;
    $title =~ s/\&\#947\;/\$\\gamma\$/g;
    $title =~ s/\&\#951\;/\$\\eta\$/g;
    $title =~ s/\&\#8594\;/\$\\rightarrow\$/g;
    $title =~ s/\&\#64257\;/fi/g;
    $title =~ s/\&\#64259\;/ffi/g;

    # Escape any underscores and other stuff in title
    $title =~ s/_/\\_/g;                             # replace "_" with "\_"
    $title =~ s/\%/\\\%/g;                           # replace "%" with "\%"
    $title =~ s/gp\-\>/\$\\gamma p\\rightarrow\$/g;  # replace "gp->" with "$\gamma p\rightarrow$"
    $title =~ s/\-\>/\$\\rightarrow\$/g;             # replace "->" with "$\rightarrow$"
    $title =~ s/(\s+)\&/$1\\\&/g;                    # replace " &" with " \&"
    $title =~ s/\\pyth\{\}/PYTHIA/g;                 # replace "\pyth{}" with "PYTHIA"
    $title =~ s/\v//g;                               # vertical tabs?? (shows up in docid=1468)
    $title =~ s/(\b\S+)\&(\S+\b)/$1\\\&$2/g;         # replace " X&Y " with " X\&Y " (e.g. R&D)
    if ($title !~ /\$\\eta\\pi/) {                   # exclude the mixed up title for docid=1443
        # Replace "\pi" with "$\pi$" unless it directly follows a "$". The
        # look-behind (instead of consuming the preceding character) also
        # covers a "\pi" at the very start of the title.
        $title =~ s/(?<!\$)\\pi/\$\\pi\$/g;
    }

    # The following will try and identify instances of things like
    # "J^PC" and replace them with "$J^{PC}$" as well as
    # "1^+-" with "$1^{+-}$" and "mm^2" with "$mm^{2}$". Here
    # is the explanation:
    #
    # Search pattern: ([^\s=]+)\^([^\s=]+)
    # The round brackets are used for remembering the matches
    # so they can be referred to as $1 and $2 in the replacement
    # pattern. The [^\s=] means to match any character that is
    # NOT a white space or "=". Thus, the whole search pattern
    # matches to any number of non-white-space,non-equals-sign
    # characters, followed by a caret (^) and then followed by
    # any number of non-white-space,non-equals-sign characters.
    # (n.b. "any number" above means "1 or more".)
    #
    # Replacement pattern: \$${1}^{${2}}\$
    # This just writes out the first set of characters matched
    # in the pattern (those to the left of the "^"), then a
    # "^", followed by the second match wrapped in curly
    # brackets {}. The whole thing is also wrapped in $ $
    # characters.
    #
    # The if statement blocks use of this if the line already has
    # a "$" character in it. The assumption being that it has
    # already been made Latex friendly.
    if ($title =~ /\^/ and $title !~ /\$/) {
        $title =~ s/([^\s=]+)\^([^\s=]+)/\$${1}^{${2}}\$/g;
    }

    return $title;
}

#----------------------------
# AddCrossReferences
#----------------------------
# Returns the BibTeX text for a fixed set of cross-reference entries and
# stores how many there are in the package global $Ncrossreferences.
sub AddCrossReferences
{
    use strict; use warnings;

    # These are cross-references found in the original gluex_docs.bib.
    # The list is lexical so that calling this routine more than once does
    # not keep appending to it.
    my @crossreferences = (
        '@techreport{curtis-note-7, Crossref = {hdnote7}}',
        '@techreport{curtis-note-14, Crossref = {hdnote14}}',
        '@techreport{curtis-note-15, Crossref = {hdnote15}}',
        '@techreport{drv2, Crossref = {hdnote19}}',
        '@techreport{Keller2, Crossref = {hdnote32}}',
        '@techreport{Keller1, Crossref = {hdnote33}}',
        '@techreport{mcnicoll-00, Crossref = {hdnote36}}',
        '@techreport{radphidamage, Crossref = {hdnote43}}',
        '@techreport{drv3, Crossref = {hdnote44}}',
        '@techreport{kleinb, Crossref = {hdnote333}}',
    );

    my $str = "\n";
    foreach my $entry (@crossreferences) {
        $str .= "$entry\n\n";
    }

    our $Ncrossreferences = scalar(@crossreferences);
    print "Adding $Ncrossreferences cross-references ...\n";

    return $str;
}