#!/usr/bin/env perl

# This script will grab information from the GlueX DocDB website in order to
# produce a Bibtex compatible references file (gluex_docs.bib) that contains
# references for every document currently in the DocDB. This automates a process
# that has been done by hand up to now.
#
# Several options exist below which must be set by modifying this script itself
# before running it. Because it downloads over 2000 webpages from the DocDB, there
# are options that allow you to save the downloaded pages to local files so they
# can be used on subsequent invocations. There is no option to update the cache
# with just the differences. You either download them all from the web, or read
# them all from the cache. This is still useful though in case you want to
# tweak the reference before writing it to the output file.
#
# Author names and institution names can be modified globally from how they are
# stored in the DocDB. Also, author lists and document titles can be replaced
# outright using the hashes below. Please see the comments for details.
#
# For questions regarding the GlueX DocDB, please contact:
# Zisis Papandreou <zisis@uregina.ca>
#
# For questions regarding this script, please contact:
# David Lawrence <davidl@jlab.org>

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
#                      Beginning of configurable section
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

# The following two flags can be used to write to or read from a local
# cache of data files. By setting the value of use_local_cache to
# a non-zero value, the local cache directory will be searched for each
# file. If it is not found, the docid will be effectively ignored.
# A non-zero value for save_to_cache will cause the files read from the
# web to be written to a local cache file, but only if use_local_cache
# is zero.
our $use_local_cache = 0;
our $save_to_cache = 1;

# Setting this to non-zero will cause documents whose document type contains
# the string "Talk" to be ignored
our $filter_talks = 0;

# Turn on/off verbose output
our $verbose = 0;

# Use the author_replace and institution_replace hashes to globally
# modify an author or institution from what is displayed on the DocDB.
# The string used on the left hand side should be exactly as it appears
# on the webpages. The string on the right is what it should be replaced
# with. Note that the first name is always replaced with the first initial
# AFTER any substitutions defined here are made.
our %author_replace;
our %institution_replace;
$author_replace{"Elton Smith"} = "Elton S. Smith";
$institution_replace{"Thomas Jefferson National Accelerator Facility"} = "Jefferson Lab";

# Use author_list_replace and title_replace to set the author list or title
# of specific docids, overriding what was found on the DocDB website. This can
# be used, for example, to replace an author list with a primary author
# followed by et al. It can also be used to correct titles that are just
# mis-typed or poorly formed in the DocDB. Note that no additional filtering is
# done on these values so they should be well-formed Latex, but with '$' and
# '\' characters escaped using a preceding '\'. (These are double-quoted Perl
# strings: an unescaped '$' starts a variable interpolation and an unescaped
# '\' starts an escape sequence, so forgetting the escape silently corrupts
# the text.)
our %author_list_replace;
our %title_replace;
$author_list_replace{19} = "The Hall D Collaboration, R. Clark \\emph{et al.}";
$author_list_replace{44} = "The Hall D Collaboration, R. Clark \\emph{et al.}";
$author_list_replace{58} = "The GlueX Collaboration";
$author_list_replace{842} = "A. Dzierba, Z. Papandreou \\emph{et al.}";
$author_list_replace{1317} = "B. Giebrecht \\emph{et al.}";
$author_list_replace{1701} = "B.D. Leverington \\emph{et al.}";
$author_list_replace{1864} = "Z. Papandreou \\emph{et al.}";

$title_replace{1702} = "Statistics for \$\\pi^{o}\$ and \$\\eta\$ production";
$title_replace{2104} = "Identification of New 5-pion Meson States in the GlueX Experiment using Amplitude Analysis";
# Both '$' characters must be escaped here; written as "$\\eta$" Perl reads
# "$\" as its output-record-separator variable and the math delimiters are lost.
$title_replace{2169} = "Low-Energy Tests of  SM and Beyond via Rare \$\\eta\$ Decays";

# NOTE: Some titles contain HTML numeric character references such as
# "&#64259;" (the "ffi" ligature) or, more commonly, "&#960;" (the Greek
# letter pi). These must be replaced to make the titles Latex friendly,
# which is done below in the subroutine "MakeLatexFriendly". If you
# encounter one that is not already handled, then you'll need to add a
# line to that subroutine.

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
#                         End of configurable section
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

# Make user give us the password for the DocDB writer account so we don't have
# to store it in this file. The prompt is skipped when everything is read
# from the local cache, since no web access is needed then.
our $password;
if($use_local_cache == 0){
	print "\n";
	print "In order to access the DocDB, you must enter the password for the.\n";
	print "writer account. We make you do this here so that it is not hardcoded\n";
	print "in this script which is publically accessible.  This is the standard\n";
	print "password used for the GlueX DocDB.\n";
	print "\n";
	print "password: ";
	$password = <>;

	# End-of-input (e.g. Ctrl-D) yields undef; treat it as an empty password.
	$password = "" unless defined $password;

	# The line read above still carries its line terminator. Remove it so
	# that it is not sent to the server as part of the password.
	$password =~ s/[\r\n]+\z//;

	print "\n";
}

# Get all document ids (GetAllDocids fills the package array @docids)
our @docids;
GetAllDocids();

# temporarily truncate list of docids (for debugging only)
#splice @docids, 1;
#@docids = (123,143,284,302);

# Inform user how many document ids we found.
print "Found ".scalar(@docids)." documents in DocDB. Retrieving and parsing ....\n";

# Sort docid values numerically
@docids = sort { $a <=> $b } @docids;


# Open output file "gluex_docs.bib" and write header into it. This file is
# the whole point of the script, so failing to create it is fatal.
open(my $bib_fh, '>', "gluex_docs.bib")
	or die "Unable to open gluex_docs.bib for writing: $!\n";

# The login name lives in the USER (or LOGNAME) environment variable; the
# lower-case "user" is kept only as a last resort for backward compatibility.
my $bib_user = $ENV{"USER"} || $ENV{"LOGNAME"} || $ENV{"user"} || "";

print {$bib_fh} "%% This file generated by the DocDB2Bibtex script\n";
print {$bib_fh} "%% by David Lawrence <davidl.at.jlab.org>\n";
print {$bib_fh} "%% ".$bib_user."  ".`date`;
print {$bib_fh} "\n\n";

# Loop over docids and get the relevant info for each
my $Nwritten = 0;
foreach my $docid (@docids){
	if($verbose){print "Getting doc info for $docid ...\n";}
	my ($authors, $institutions, $title, $month, $year) = GetDocInfo($docid);

	# Replace author list or title if specified. We wrap the author list in
	# an additional set of curly brackets {} so that LateX will maintain
	# the order (otherwise, it would sort them alphabetically).
	if(exists($author_list_replace{$docid})){$authors = "{".$author_list_replace{$docid}."}";}
	if(exists($title_replace{$docid})){$title = $title_replace{$docid};}

	# Only write out this reference if the authors list is not empty
	next unless defined($authors) && length($authors);

	# Month and Year are written without braces, so an empty value would
	# produce a syntactically invalid entry. Those two fields are therefore
	# left out when no date could be determined.
	my @fields;
	push @fields, "\tAuthor = {$authors}";
	push @fields, "\tInstitution = {$institutions}";
	push @fields, "\tMonth = $month" if defined($month) && length($month);
	push @fields, "\tNote = {\\url{http://argus.phys.uregina.ca/cgi-bin/private/DocDB/ShowDocument?docid=$docid}}";
	push @fields, "\tNumber = {GlueX-doc-\\textbf{$docid}}";
	push @fields, "\tTitle = {{$title}}";
	push @fields, "\tYear = $year" if defined($year) && length($year);

	my $entry = "\@techreport{hdnote$docid,\n".join(",\n", @fields)."}\n";

	if($verbose){print "$entry\n";}

	print {$bib_fh} "$entry\n";
	$Nwritten++;
}
print "Generated $Nwritten references from DocDB data.\n";

# Add additional cross references (AddCrossReferences sets $Ncrossreferences)
our $Ncrossreferences;
print {$bib_fh} AddCrossReferences();
$Nwritten += $Ncrossreferences;

# Close gluex_docs.bib. Buffered write errors only show up here.
close($bib_fh) or die "Error while writing gluex_docs.bib: $!\n";
print "Wrote $Nwritten references to gluex_docs.bib\n";

# Create a TeX file that cites every single docid so it can be tested.
# This file is only an aid for checking gluex_docs.bib, so a failure to
# write it is reported as a warning rather than aborting the script.
{
	my $tex_name = "gluex_doc_cite_test.tex";

	my $tex  ="\\documentclass[11pt]{article}\n";
	$tex .="\\usepackage{geometry}\n";
	$tex .="\\geometry{letterpaper}\n";
	$tex .="\\usepackage{amssymb}\n";
	$tex .="\\usepackage{url}\n";
	$tex .="\n";
	$tex .="\\title{GlueX DocDB References Test}\n";
	$tex .="\\author{GlueX Collaboration}\n";

	$tex .="\\begin{document}\n";
	$tex .="\\maketitle\n";

	# One \cite per docid found in the DocDB list
	foreach my $id (@docids){
		$tex .="\\cite{hdnote$id}\n";
	}
	$tex .= "\n";
	$tex .= "\\bibliographystyle{unsrt}\n";
	$tex .="\n\n\\bibliography{gluex_docs}{}\n\n";
	$tex .="\\end{document}\n\n";

	if(open(my $tex_fh, '>', $tex_name)){
		print {$tex_fh} $tex;
		close($tex_fh) or warn "Error while writing $tex_name: $!\n";
	}else{
		warn "Unable to open $tex_name for writing: $!\n";
	}
}

# Create Makefile to generate test PDF. Recipe lines must start with a tab
# character, hence the explicit \t. Like the TeX test file, this is only a
# convenience, so a write failure produces a warning instead of aborting.
{
	my $makefile  ="\n";
	$makefile .="all: gluex_doc_cite_test.tex gluex_docs.bib\n";
	$makefile .="\tpdflatex gluex_doc_cite_test.tex\n";
	$makefile .="\tbibtex gluex_doc_cite_test\n";
	$makefile .="\tpdflatex gluex_doc_cite_test.tex\n";
	$makefile .="\tpdflatex gluex_doc_cite_test.tex\n";
	$makefile .="\n";

	if(open(my $make_fh, '>', "Makefile")){
		print {$make_fh} $makefile;
		close($make_fh) or warn "Error while writing Makefile: $!\n";
	}else{
		warn "Unable to open Makefile for writing: $!\n";
	}
}

# Final message to user: explain how to build the citation test document
# and where the references ended up. Each list element is one output line.
print "$_\n" foreach (
	'',
	'A test file containing citations to all references has been generated',
	'so the final gluex_docs.bib file can be tested. To build it, type "make"',
	'or do the following:',
	'',
	'pdflatex gluex_doc_cite_test.tex',
	'bibtex gluex_doc_cite_test',
	'pdflatex gluex_doc_cite_test.tex',
	'pdflatex gluex_doc_cite_test.tex',
	'',
	'The references have been written to "gluex_docs.bib"',
);

#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
#                         Subroutines
#<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

#
# These globally change the name of an author and institution
# from how it is displayed on the DocDB. The author will have
# the first name replaced with just an initial too.

#----------------------------
# AuthorFilter
#----------------------------
sub AuthorFilter
{
	# Normalize one author name for the bibliography.
	#
	# Argument: the author name exactly as found on the DocDB page.
	# Returns:  the name after applying any entry in %author_replace and
	#           then shortening the first name to an initial, e.g.
	#           "David Lawrence" becomes "D. Lawrence".
	#
	# Names that already start with an initial ("D. Lawrence", "D Lawrence")
	# and names containing no whitespace at all (a single word) are returned
	# without shortening.

	our %author_replace;
	my $author = $_[0];
	$author = "" unless defined $author;

	if(exists($author_replace{$author})){
		$author = $author_replace{$author};
	}

	# If second letter is not a space or a ".", then chop off rest of
	# first name and make just an initial.
	my $second_char = length($author) > 1 ? substr($author, 1, 1) : '';
	if($second_char ne ' ' and $second_char ne '.'){
		# Only rewrite when there really is whitespace to split on; the
		# capture holds everything after the first whitespace character.
		if($author =~ /\s(.*)\z/s){
			$author = substr($author, 0, 1).". ".$1;
		}
	}

	return $author;
}

#----------------------------
# InstitutionFilter
#----------------------------
sub InstitutionFilter
{
	# Map an institution name, as shown on the DocDB, to its preferred
	# spelling. Names without an entry in %institution_replace are
	# returned unchanged.

	our %institution_replace;
	my ($name) = @_;

	return exists($institution_replace{"$name"})
		? $institution_replace{"$name"}
		: $name;
}


#
# Get all document ids from the DocDB
#

#----------------------------
# GetAllDocids
#----------------------------
sub GetAllDocids
{
	# Fill the package array @docids with the id of every document listed
	# in the DocDB's FullList.html page.
	#
	# The page is either downloaded with curl (use_local_cache == 0), in
	# which case it is optionally saved to SAVED_DOCINFO_FILES/FullList.html
	# (save_to_cache != 0), or read back from that cached file.
	# Problems reading or writing are reported as warnings; @docids is then
	# simply left with whatever could be parsed (possibly nothing).

	our $use_local_cache;
	our $save_to_cache;
	our $password;

	our @docids = ();

	my $cache_dir  = "SAVED_DOCINFO_FILES";
	my $cache_file = "$cache_dir/FullList.html";
	my @lines;

	# Read list of docids
	if($use_local_cache == 0){

		# Get list from web
		print "Retrieving full document list from server .... \n";

		# A password read from the terminal may still carry its newline.
		my $pw = defined($password) ? $password : "";
		$pw =~ s/[\r\n]+\z//;

		# Get the file FullList.html from the server. curl is run through
		# the list form of open so that no shell is involved; a password
		# containing quotes, "$" or backticks is passed through verbatim.
		my $fulllist = "";
		if(open(my $curl, '-|', 'curl', '-u', "writer:$pw",
		        'http://argus.phys.uregina.ca/gluex/DocDB//Static/Lists/writer/FullList.html')){
			local $/;   # slurp the whole response at once
			my $body = <$curl>;
			$fulllist = $body if defined $body;
			close($curl);
		}else{
			warn "Unable to run curl: $!\n";
		}
		@lines = split(/\n/, $fulllist);

		if($save_to_cache != 0){
			if(-d $cache_dir or mkdir($cache_dir)){
				if(open(my $out, '>', $cache_file)){
					print {$out} $fulllist;
					close($out) or warn "Error while writing $cache_file: $!\n";
				}else{
					warn "Unable to open $cache_file for writing: $!\n";
				}
			}else{
				warn "Unable to create directory $cache_dir: $!\n";
			}
		}

	}else{
		# Get list from cached file
		print "Retrieving full document list from cached file .... \n";

		if(open(my $in, '<', $cache_file)){
			@lines = <$in>;
			close($in);
		}else{
			warn "Unable to open $cache_file: $!\n";
		}
	}

	# Loop over all lines in the FullList.html file and look for ones
	# describing a document. All we need at the moment is the docid.
	foreach my $line (@lines){
		next unless $line =~ /^<td class="Docid">/;

		# The id is everything between "docid=" and the closing quote
		# of the link target.
		next unless $line =~ /docid=([^"]*)"/;
		my $docid = $1;

		$docid =~ s/&amp.*\z//s;  # strip any version number
		next unless length($docid);

		push @docids, $docid;
	}

	return;
}


#
# Get info for a single, specific document from DocDB
#

#----------------------------
# GetDocInfo
#----------------------------
sub GetDocInfo
{
	# Collect the bibliographic data for one document.
	#
	# Argument: the docid.
	# Returns:  the list (authors, institutions, title, month, year).
	#           Authors and institutions are each joined with " and ", in
	#           the order in which they first appear on the page. Any item
	#           that cannot be found on the page is returned as an empty
	#           string; nothing is carried over from a previous call.
	#           Five empty strings are returned for documents removed by
	#           the filter_talks option.
	#
	# The page comes from SAVED_DOCINFO_FILES/ShowDocument<docid>.html when
	# use_local_cache is set, and otherwise from the DocDB server via curl
	# (and is then optionally written to that cache file). The script exits
	# if the server reports an authorization failure.

	our $use_local_cache;
	our $save_to_cache;
	our $filter_talks;
	our $verbose;
	our $password;

	my $docid = $_[0];

	my $cache_dir = "SAVED_DOCINFO_FILES";
	my $fname = $cache_dir."/ShowDocument".$docid.".html";
	my @lines;

	# Either grab a locally cached html file or grab it off the web
	if($use_local_cache != 0){
		# Try opening cached file. A missing file just leaves @lines empty,
		# which results in an empty author list for this docid.
		if($verbose){print "Trying $fname ...\n";}
		if(open(my $cached, '<', $fname)){
			@lines = <$cached>;
			close($cached);
		}elsif($verbose){
			print "Unable to open $fname: $!\n";
		}
	}else{
		# A password read from the terminal may still carry its newline.
		my $pw = defined($password) ? $password : "";
		$pw =~ s/[\r\n]+\z//;

		# Grab the document page from the DocDB server. The list form of
		# open runs curl without a shell, so the password and docid are
		# passed through verbatim whatever characters they contain.
		my $url = "http://argus.phys.uregina.ca/cgi-bin/private/DocDB/ShowDocument?docid=$docid";
		my $html = "";
		if(open(my $curl, '-|', 'curl', '-u', "writer:$pw", $url)){
			local $/;   # slurp the whole response at once
			my $body = <$curl>;
			$html = $body if defined $body;
			close($curl);
		}else{
			warn "Unable to run curl for docid $docid: $!\n";
		}

		# If authentication failed, detect it now and exit the program
		if($html =~ /401 Authorization Required/){
			print "\nAuthorization has failed. Maybe you mis-typed the password??\n";
			print "Try re-running. If you still have problems, contact is Zisis\n";
			print "Papandreou (zisis\@uregina.ca) to make sure you have the correct\n";
			print "password.\n";
			exit(-1);
		}

		# Optionally save to local file
		if($save_to_cache != 0){
			if(-d $cache_dir or mkdir($cache_dir)){
				if($verbose){print "Writing file: $fname\n";}
				if(open(my $out, '>', $fname)){
					print {$out} $html;
					close($out) or warn "Error while writing $fname: $!\n";
				}else{
					warn "Unable to open $fname for writing: $!\n";
				}
			}else{
				warn "Unable to create directory $cache_dir: $!\n";
			}
		}
		@lines = split(/\n/, $html);
	}

	# Everything below is local to this document so that a page lacking
	# some item cannot inherit the value found for an earlier document.
	my ($title, $month, $year) = ("", "", "");
	my $next_line_is_date = 0;
	my (@authors, %seen_author);
	my (@institutions, %seen_institution);

	foreach my $line (@lines){

		# Get title: the text following the first ": " on the <title> line
		# (or following the <title> tag itself if there is no ": "), up to
		# the closing tag.
		if($line =~ /<title>(.*)/s){
			my $after_tag = $1;
			my $text = ($line =~ /: (.*)/s) ? $1 : $after_tag;
			$text =~ s/<\/title>.*//s;
			$text =~ s/[\r\n]+\z//;
			$title = $text;
		}

		# Get Document type (the text of the first link on that line) and
		# filter document types here
		if($line =~ /Document type:/){
			if($filter_talks != 0 and $line =~ /">(.*?)<\/a>/){
				my $type = $1;
				if($type =~ /Talk/){
					if($verbose){print "Ignoring Document Type: $type\n";}
					return ("", "", "", "", "");
				}
			}
		}

		# Get revision date. It is expected on the <dd> line directly
		# after the "Contents Revised:" line; the second and third
		# whitespace-separated tokens are taken as month and year, and
		# the last character of the year token is dropped.
		if($next_line_is_date and $line =~ /<dd>/){
			my @tokens = split(/\s/, $line);
			$month = defined($tokens[1]) ? $tokens[1] : "";
			$year  = defined($tokens[2]) ? $tokens[2] : "";
			chop($year);
		}
		$next_line_is_date = ($line =~ /Contents Revised:/) ? 1 : 0;

		# Get Authors. Every author link carries "authorid=", so splitting
		# on that gives one chunk per author after the leading chunk that
		# holds the <div> itself.
		if($line =~ /<div id="Authors">/){
			my @blocks = split(/authorid=/, $line);
			shift @blocks;
			foreach my $block (@blocks){

				# Institution is the value of the title attribute, author
				# is the link text. Chunks that do not have this form are
				# skipped.
				next unless $block =~ /title="(.*?)">(.*?)<\/a>/s;
				my ($institution, $author) = ($1, $2);

				chomp($author);
				chomp($institution);

				# Remember each name once, keeping the page order
				if(length($author) and !$seen_author{$author}++){
					push @authors, $author;
				}
				if(length($institution) and !$seen_institution{$institution}++){
					push @institutions, $institution;
				}
			}
		}
	}

	# Make Latex friendly title
	$title = MakeLatexFriendly($title);

	# Make Latex friendly author string
	my $all_authors = join(" and ", map { AuthorFilter($_) } @authors);

	# Make Latex friendly institution string
	my $all_institutions = join(" and ", map { InstitutionFilter($_) } @institutions);

	return ($all_authors, $all_institutions, $title, $month, $year);
}

#----------------------------
# MakeLatexFriendly
#----------------------------
sub MakeLatexFriendly
{
	# This routine is used to modify a title to be Latex friendly
	# so it won't cause failures when typesetting. The current GlueX
	# DocDB has all kinds of mixtures of Latex, HTML, and ASCII
	# encodings in the titles so numerous filters are applied here
	# to try and get them all to work. Some explanations are lengthy
	# just due to the cryptic nature of perl regexes which can make
	# understanding some of them very difficult!
	#
	# Argument: the raw title.
	# Returns:  the filtered title (an empty string for an undefined one).

	my $title = $_[0];
	$title = "" unless defined $title;

	# Replace some HTML symbols with Latex
	$title =~ s/\&\#960\;/\$\\pi\$/g;
	$title =~ s/\&\#947\;/\$\\gamma\$/g;
	$title =~ s/\&\#951\;/\$\\eta\$/g;
	$title =~ s/\&\#8594\;/\$\\rightarrow\$/g;
	$title =~ s/\&\#64257\;/fi/g;
	$title =~ s/\&\#64259\;/ffi/g;
	$title =~ s/\&amp\;/\&/g;  # HTML-encoded "&"; it gets escaped further down

	# Escape any underscores and other stuff in title. The (?<!\\) look-behind
	# skips characters that already have a "\" in front of them, so that a
	# title which was entered as Latex is not escaped a second time
	# ("\%" must not become "\\%").
	$title =~ s/(?<!\\)_/\\_/g;  # replace "_" with "\_"
	$title =~ s/(?<!\\)\%/\\\%/g;  # replace "%" with "\%"
	$title =~ s/gp\-\>/\$\\gamma p\\rightarrow\$/g; # replace "gp->" with "$\gamma p\rightarrow$"
	$title =~ s/\-\>/\$\\rightarrow\$/g; # replace "->" with "$\rightarrow$"
	$title =~ s/(\s+)\&/$1\\\&/g; # replace " &" with " \&"
	$title =~ s/\\pyth\{\}/PYTHIA/g; # replace "\pyth{}" with "PYTHIA"
	$title =~ s/\v//g;  # vertical tabs?? (shows up in docid=1468)
	$title =~ s/(\b\S+)(?<!\\)\&(\S+\b)/$1\\\&$2/g; # replace " X&Y " with " X\&Y " (e.g. R&D)
	if($title !~ /\$\\eta\\pi/){ # exclude the mixed up title for docid=1443
		# replace "\pi" with "$\pi$" unless a "$" is directly in front of it;
		# the "^" alternative covers a "\pi" at the very start of the title
		$title =~ s/(^|[^\$])\\pi/$1\$\\pi\$/g;
	}

	# The following will try and identify instances of things like
	# "J^PC" and replace them with "$J^{PC}$" as well as
	# "1^+-" with "$1^{+-}$" and "mm^2" with "$mm^{2}$". Here
	# is the explanation:
	#
	# Search pattern:  ([^\s=]+)\^([^\s=]+)
	#  The round brackets are used for remembering the matches
	#  so they can be referred to as $1 and $2 in the replacement
	#  pattern. The [^\s=] means to match any character that is
	#  NOT a white space or "=". Thus, the whole search pattern
	#  matches to any number of non-white-space,non-equals-sign
	#  characters, followed by a caret (^) and then followed by
	#  any number of non-white-space,non-equals-sign characters.
	#  (n.b. "any number" above means "1 or more".)
	#
	# Replacement pattern: \$${1}^{${2}}\$
	#  This just writes out the first set of characters matched
	#  in the pattern (those to the left of the "^"), then a
	#  "^", followed by the second match wrapped in curly
	#  brackets {}. The whole thing is also wrapped in $ $
	#  characters.
	#
	#  The if statement blocks use of this if the line already has
	#  a "$" character in it. The assumption being that it has
	#  already been made Latex friendly.
	if($title =~ /\^/ and $title !~ /\$/){
		$title =~ s/([^\s=]+)\^([^\s=]+)/\$${1}^{${2}}\$/g;
	}

	return $title;
}

#----------------------------
# AddCrossReferences
#----------------------------
sub AddCrossReferences
{
	# These are cross-references found in the original gluex_docs.bib
	#
	# Returns the text to append to the .bib file: a blank line followed
	# by one entry per cross-reference, each followed by a blank line.
	# The number of entries is stored in the package variable
	# $Ncrossreferences. The list is local to each call, so calling this
	# routine more than once always yields the same text and count.

	my @crossreferences = (
		'@techreport{curtis-note-7,  Crossref = {hdnote7}}',
		'@techreport{curtis-note-14, Crossref = {hdnote14}}',
		'@techreport{curtis-note-15, Crossref = {hdnote15}}',
		'@techreport{drv2,           Crossref = {hdnote19}}',
		'@techreport{Keller2,        Crossref = {hdnote32}}',
		'@techreport{Keller1,        Crossref = {hdnote33}}',
		'@techreport{mcnicoll-00,    Crossref = {hdnote36}}',
		'@techreport{radphidamage,   Crossref = {hdnote43}}',
		'@techreport{drv3,           Crossref = {hdnote44}}',
		'@techreport{kleinb,         Crossref = {hdnote333}}',
	);

	my $str = "\n";
	foreach my $reference (@crossreferences){ $str .= "$reference\n\n"; }

	our $Ncrossreferences = scalar(@crossreferences);

	print "Adding $Ncrossreferences cross-references ...\n";

	return $str;

}