% Journal, newsletter and proceedings name macros.
% The macros cacm, ibmjrd, ieeetc and toplas are referenced by entries but
% are not defined here; the standard styles (plain, abbrv, alpha, unsrt)
% predefine them.  A non-standard style may need its own definitions.
@string{ai           = {Artificial Intelligence}}
@string{byte         = {BYTE}}
@string{can          = {Computer Architecture News}}
@string{complang     = {Computer Languages}}
@string{ieeemicro    = {IEEE Micro}}
@string{ieeecomputer = {Computer}}
@string{jfar         = {Journal of Forth Application and Research}}
@string{sigart       = {SIGART Newsletter}}
@string{sigforth     = {SigForth Newsletter}}
@string{sigplan      = {SIGPLAN Notices}}
@string{spe          = {Software---Practice and Experience}}
@string{iclp3        = {Logic Programming: Third International Conference}}
@string{iclp6        = {Logic Programming: Sixth International Conference}}

% Chapter by L. M. Pereira.  The author field carries the correct surname;
% the citation key keeps its original spelling so that existing \cite
% commands continue to resolve.
@incollection{peirera84,
  author    = {Pereira, L. M.},
  title     = {Logic Control with Logic},
  booktitle = {Implementations of Prolog},
  editor    = {Campbell, J. A.},
  publisher = {Ellis-Horwood},
  year      = {1984},
  pages     = {177--193},
}

@mastersthesis{pichler85,
  author = {Christian Pichler},
  title  = {{Prolog-\"{U}bersetzer}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1985},
}

@book{pangratz86,
  author    = {H. Pangratz},
  title     = {{Logische Schaltkreise}},
  publisher = {{Technische Universit\"at Wien, Institut f\"{u}r
               Datenverarbeitung}},
  year      = {1986},
}

@book{vanhentenryck89,
  author    = {Van Hentenryck, Pascal},
  title     = {{Constraint Satisfaction in Logic Programming}},
  series    = {{Logic Programming Series}},
  year      = {1989},
  publisher = {MIT Press},
  address   = {Cambridge, Massachusetts},
}

@phdthesis{krall88,
  author = {Andreas Krall},
  title  = {{Analyse und Implementierung von Prolog-Systemen}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1988},
}

@inproceedings{jaffar&lassez87b,
  author    = {Joxan Jaffar and Jean-Louis Lassez},
  title     = {From Unification to Constraints},
  booktitle = {Logic Programming '87},
  year      = {1987},
  editor    = {K. Furukawa and H. Tanaka and T. Fujisaki},
  pages     = {1--18},
  publisher = {Springer LNCS 315},
}

@inproceedings{cs-prolog,
  author    = {Toshio Kawamura and Hayato Ohwada and Fumio Mizoguchi},
  title     = {{CS-Prolog}: A Generalized Unification Based Constraint Solver},
  booktitle = {Logic Programming '87},
  year      = {1987},
  editor    = {K. Furukawa and H. Tanaka and T. Fujisaki},
  pages     = {19--39},
  publisher = {Springer LNCS 315},
}

@inproceedings{dechter&pearl88,
  author    = {R. Dechter and J. Pearl},
  title     = {A Problem Simplification Approach that Generates Heuristics for
               Constraint-Satisfaction Problems},
  booktitle = {Machine Intelligence 11},
  year      = {1988},
  editor    = {J. E. Hayes and D. Michie and J. Richards},
  publisher = {Oxford University Press},
}

% Second author's surname is Elliott (double t).  The citation key keeps its
% original spelling so that existing \cite commands continue to resolve.
@article{haralick&elliot80,
  author  = {Robert M. Haralick and Gordon L. Elliott},
  title   = {Increasing Tree Search Efficiency for Constraint Satisfaction
             Problems},
  journal = ai,
  year    = {1980},
  volume  = {14},
  pages   = {263--313},
}

@inproceedings{jaffar&lassez87a,
  author    = {Joxan Jaffar and Jean-Louis Lassez},
  title     = {Constraint Logic Programming},
  booktitle = {Fourteenth Annual {ACM} Symposium on Principles of
               Programming Languages (POPL)},
  year      = {1987},
  pages     = {111--119},
  address   = {M{\"u}nchen},
}

@article{sussman&steele80,
  author  = {Gerald Jay Sussman and Guy Lewis {Steele Jr.}},
  title   = {CONSTRAINTS---A Language for Expressing Almost-Hierarchical
             Descriptions},
  journal = ai,
  year    = {1980},
  volume  = {14},
  pages   = {1--39},
}

@article{rossi88,
  author  = {Francesca Rossi},
  title   = {Constraint Satisfaction Problems in Logic Programming},
  journal = sigart,
  year    = {1988},
  number  = {106},
  pages   = {24--28},
  month   = oct,
}

@article{freeman-benson+90,
  author  = {Bjorn N. Freeman-Benson and John Maloney and Alan Borning},
  title   = {An Incremental Constraint Solver},
  journal = cacm,
  year    = {1990},
  volume  = {33},
  number  = {1},
  pages   = {54--63},
  month   = jan,
}

% Proceedings details are taken from the iclp87 entry through crossref.
@inproceedings{carlsson87,
  author   = {Mats Carlsson},
  title    = {Freeze, Indexing and Other Implementation Issues in the {WAM}},
  crossref = {iclp87},
  pages    = {40--58},
}

@book{boizumault88,
  author    = {Patrice Boizumault},
  title     = {Prolog: L'implantation},
  publisher = {Masson},
  year      = {1988},
  address   = {Paris},
}

@book{vancaneghem86,
  author    = {Van Caneghem, Michel},
  title     = {L'anatomie de Prolog},
  publisher = {InterEditions},
  year      = {1986},
  address   = {Paris},
}

@inproceedings{boizumault86,
  author    = {Patrice Boizumault},
  title     = {A General Model to Implement {\tt dif} and {\tt freeze}},
  booktitle = iclp3,
  year      = {1986},
  pages     = {585--592},
  address   = {London},
  publisher = {Springer LNCS 225},
}

@techreport{gabriel+85,
  author      = {John Gabriel and Tim Lindholm and E. L. Lusk and R. A.
                 Overbeek},
  title       = {A Tutorial on the {Warren Abstract Machine} for Computational Logic},
  institution = {Argonne National Laboratory},
  year        = {1985},
  number      = {ANL-84-84},
}

% Paper in the 1987 IEEE Symposium on Logic Programming; the booktitle uses
% the same "Symposium" spelling as the 1986 entry in this file.
@inproceedings{touati&despain87,
  author       = {Herv\'{e} Touati and Alvin Despain},
  title        = {An Empirical Study of the Warren Abstract Machine},
  booktitle    = {1987 Symposium on Logic Programming},
  year         = {1987},
  pages        = {114--124},
  organization = {IEEE},
}

@book{neumann88,
  author    = {Gustaf Neumann},
  title     = {Metaprogrammierung und Prolog},
  publisher = {Addison-Wesley},
  year      = {1988},
  series    = {Internationale Computer-Bibliothek},
  address   = {Bonn},
}

@incollection{mellish82,
  author    = {C. S. Mellish},
  title     = {An Alternative to Structure Sharing in the Implementation of a
               {Prolog} Interpreter},
  booktitle = {Logic Programming},
  publisher = {Academic Press},
  year      = {1982},
  editor    = {K. L. Clark and S.-A. T{\"a}rnlund},
  pages     = {99--106},
  address   = {London},
}

@inproceedings{bruynooghe86,
  author       = {Maurice Bruynooghe and Danny de Schreye and Bruno Krekels},
  title        = {Compiling Control},
  booktitle    = {1986 Symposium on Logic Programming},
  year         = {1986},
  pages        = {70--77},
  organization = {IEEE},
}

% CACM survey article; author's given name is Jacques.
@article{cohen90,
  author  = {Jacques Cohen},
  title   = {Constraint Logic Programming Languages},
  journal = cacm,
  year    = {1990},
  volume  = {33},
  number  = {7},
  pages   = {52--68},
  month   = jul,
}

@article{colmerauer90,
  author  = {Alain Colmerauer},
  title   = {An Introduction to {Prolog III}},
  journal = cacm,
  year    = {1990},
  volume  = {33},
  number  = {7},
  pages   = {69--90},
  month   = jul,
}

@article{freuder78,
  author  = {Eugene C. Freuder},
  title   = {Synthesizing Constraint Expressions},
  journal = cacm,
  year    = {1978},
  volume  = {21},
  number  = {11},
  pages   = {958--966},
  month   = nov,
}

% In the four iclp6 papers below, OPTeditor is not a standard field name, so
% the standard styles do not print it.
@inproceedings{beringer&porcher89,
  author    = {Henri Beringer and Franck Porcher},
  title     = {A Relevant Scheme for {Prolog} Extensions: {CLP(Conceptual
               Theory)}},
  booktitle = iclp6,
  year      = {1989},
  OPTeditor = {Giorgio Levi and Maurizio Martelli},
  pages     = {131--148},
  address   = {Lissabon},
}

@inproceedings{borning+89,
  author    = {Alan Borning and Michael Maher and Amy Martindale and Molly Wilson},
  title     = {Constraint Hierarchies and Logic Programming},
  booktitle = iclp6,
  year      = {1989},
  OPTeditor = {Giorgio Levi and Maurizio Martelli},
  pages     = {149--164},
  address   = {Lissabon},
}

@inproceedings{vanhentenryck89b,
  author    = {Van Hentenryck, Pascal},
  title     = {Parallel Constraint Satisfaction in Logic Programming:
               Preliminary Results of CHIP within PEPSys},
  booktitle = iclp6,
  year      = {1989},
  OPTeditor = {Giorgio Levi and Maurizio Martelli},
  pages     = {165--180},
  address   = {Lissabon},
}

@inproceedings{walinsky89,
  author    = {Clifford Walinsky},
  title     = {{CLP($\Sigma^*$)}: Constraint Logic Programming with Regular Sets},
  booktitle = iclp6,
  year      = {1989},
  OPTeditor = {Giorgio Levi and Maurizio Martelli},
  pages     = {181--196},
  address   = {Lissabon},
}

% Lloyd's textbook; title spelling corrected to "Foundations".
@book{lloyd84,
  author    = {John Wylie Lloyd},
  title     = {Foundations of Logic Programming},
  publisher = {Springer-Verlag},
  year      = {1984},
  address   = {Berlin},
}

@inproceedings{Dincbas+88,
  author    = {Dincbas, Mehmet and Van Hentenryck, Pascal and Simonis, Helmut and Aggoun, Abder and Graf, Thomas and Berthier, Fran\c{c}oise},
  title     = {{The Constraint Logic Programming Language CHIP}},
  booktitle = {{International Conference on
               Fifth Generation Computer Systems (FGCS)}},
  address   = {Tokyo},
  month     = dec,
  year      = {1988},
}

@article{cuadrado85,
  author  = {Clara Y. Cuadrado and John L. Cuadrado},
  title   = {Prolog Goes to Work},
  journal = byte,
  year    = {1985},
  volume  = {10},
  number  = {8},
  pages   = {151--158},
  month   = aug,
}

@article{kowalski85,
  author  = {Robert Kowalski},
  title   = {Logic Programming},
  journal = byte,
  year    = {1985},
  volume  = {10},
  number  = {8},
  pages   = {161--177},
  month   = aug,
}

@article{darlington85,
  author  = {John Darlington},
  title   = {Program Transformation},
  journal = byte,
  year    = {1985},
  volume  = {10},
  number  = {8},
  pages   = {201--216},
  month   = aug,
}

@manual{heintze+86,
  title        = {The {CLP} Programmer's Manual, Version 1.0},
  author       = {Nevin Heintze and Joxan Jaffar and Chean Shen Lim and Spiro
                  Michaylov and Peter Stuckey and Roland Yap and Chut Ngeow Yee},
  organization = {Department of Computer Science, Monash University},
  address      = {Australia},
  year         = {1986},
}

% The conference title is inherited from the aaai87 proceedings entry through
% crossref.  aaai87 is an entry key, not a string macro, so it cannot be used
% as a bare booktitle value.  Classic BibTeX requires the cross-referenced
% entry to appear later in the file than this one.
@inproceedings{saraswat87,
  author   = {Vijay A. Saraswat},
  title    = {{CP} as a General-Purpose Constraint-Language},
  crossref = {aaai87},
  year     = {1987},
  pages    = {53--58},
}

% Proceedings record for AAAI 1987.  booktitle repeats title.
@proceedings{aaai87,
  key       = {AAAI-87},
  booktitle = {{Sixth National Conference on Artificial Intelligence
               (AAAI)}},
  title     = {{Sixth National Conference on Artificial Intelligence
               (AAAI)}},
  year      = {1987},
}

% The conference title is inherited from the aaai90 proceedings entry through
% crossref.  aaai90 is an entry key, not a string macro, so it cannot be used
% as a bare booktitle value.  Classic BibTeX requires the cross-referenced
% entry to appear later in the file than this one.
@inproceedings{mcallester90,
  author   = {David McAllester},
  title    = {Truth Maintenance},
  crossref = {aaai90},
  year     = {1990},
  pages    = {1109--1116},
}

% Proceedings record for AAAI 1990.  booktitle repeats title.
@proceedings{aaai90,
  key       = {AAAI-90},
  booktitle = {{Ninth National Conference on Artificial Intelligence
               (AAAI-90)}},
  title     = {{Ninth National Conference on Artificial Intelligence
               (AAAI-90)}},
  year      = {1990},
}

@article{clocksin87,
  author  = {William Clocksin},
  title   = {A {Prolog} Primer},
  journal = byte,
  year    = {1987},
  volume  = {12},
  number  = {9},
  pages   = {147--158},
  month   = aug,
}

@article{lassez87,
  author  = {Catherine Lassez},
  title   = {Constraint Logic Programming},
  journal = byte,
  year    = {1987},
  volume  = {12},
  number  = {9},
  pages   = {171--176},
  month   = aug,
}

@article{colmerauer87,
  author  = {Alain Colmerauer},
  title   = {Opening the {Prolog~III} Universe},
  journal = byte,
  year    = {1987},
  volume  = {12},
  number  = {9},
  pages   = {177--182},
  month   = aug,
}

@book{naish86,
  author    = {Lee Naish},
  title     = {Negation and Control in {Prolog}},
  publisher = {Springer LNCS~238},
  year      = {1986},
}

% First author's given name is Francis (plain c, no cedilla).
@book{giannesini+86,
  author    = {Francis Giannesini and Henry Kanoui and Robert Pasero and
               Michel van Caneghem},
  title     = {{Prolog}},
  publisher = {Addison-Wesley},
  year      = {1986},
  series    = {International Computer Science Series},
}

% Paper in the 1987 IEEE Symposium on Logic Programming; the booktitle uses
% the same "Symposium" spelling as the 1986 entry in this file.
@inproceedings{seki&furukawa87,
  author       = {Hirohisa Seki and Koichi Furukawa},
  title        = {Notes on Transformation Techniques for Generate and Test Logic
                  Programs},
  booktitle    = {1987 Symposium on Logic Programming},
  year         = {1987},
  pages        = {215--223},
  organization = {IEEE},
}

@incollection{clark+82,
  author    = {K. L. Clark and F. G. McCabe and S. Gregory},
  title     = {{IC-Prolog} Language Features},
  booktitle = {Logic Programming},
  publisher = {Academic Press},
  year      = {1982},
  editor    = {K. L. Clark and S.-A. T{\"a}rnlund},
  pages     = {253--266},
  address   = {London},
}

@mastersthesis{knaus88,
  author = {Bernhard J. Knaus},
  title  = {{Prolog Meta-Interpreter}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1988},
}

@book{tick88,
  author    = {Evan Tick},
  title     = {Memory Performance of {Prolog} Architectures},
  publisher = {Kluwer Academic Publishers},
  year      = {1988},
  address   = {Boston},
}

@article{kowalski79,
  author  = {Robert Kowalski},
  title   = {Algorithm = Logic + Control},
  journal = cacm,
  year    = {1979},
  volume  = {22},
  number  = {7},
  pages   = {424--436},
  month   = jul,
}

@mastersthesis{neumerkel89,
  author = {Ulrich Neumerkel},
  title  = {{Speicherbereinigung f\"ur Prologsysteme}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1989},
}

@mastersthesis{slany89,
  author = {Wolfgang Slany},
  title  = {{Optimierung relationaler Anfragen am Beispiel der ARTHUR
            Implementierung}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1989},
}

@phdthesis{holzbaur90,
  author = {Christian Holzbaur},
  title  = {Implementation of Constraint Based Inference Mechanisms through
            Extended Unification},
  school = {{Technische Universit\"at Wien}},
  year   = {1990},
}

@article{lauriere78,
  author  = {Jean-Louis Lauriere},
  title   = {A Language and a Program for Stating and for Solving
             Combinatorial Problems},
  journal = ai,
  year    = {1978},
  volume  = {10},
  pages   = {29--127},
}

@article{mackworth77,
  author  = {Alan K. Mackworth},
  title   = {Consistency in Networks of Relations},
  journal = ai,
  year    = {1977},
  volume  = {8},
  pages   = {99--118},
}

@inproceedings{gaschnig77,
  author    = {John Gaschnig},
  title     = {A General Backtrack Algorithm That Eliminates Most Redundant
               Tests},
  booktitle = {5th International Joint Conference on
               Artificial Intelligence},
  year      = {1977},
  pages     = {457},
}

@unpublished{carlsson90,
  author = {Mats Carlsson},
  title  = {Re: Arrays in {Prolog}},
  note   = {Usenet News {``1990 Sep 25: 124816.12993@sics.se''}},
  year   = {1990},
}

@misc{naish90,
  author       = {Lee Naish},
  year         = {1990},
  howpublished = {E-Mail Korrespondenz},
}

@article{nudel83,
  author  = {Bernard Nudel},
  title   = {Consistent Labeling Problems and their Algorithms: Expected
             Complexities and Theory-Based Heuristics},
  journal = ai,
  year    = {1983},
  volume  = {21},
  pages   = {135--178},
}

@book{pearl84,
  author    = {Judea Pearl},
  title     = {Heuristics---Intelligent Search Strategies for Computer Problem
               Solving},
  publisher = {Addison-Wesley},
  year      = {1984},
}

@book{kaindl89,
  author    = {Hermann Kaindl},
  title     = {{Probleml\"osen durch heuristische Suche in der Artificial
               Intelligence}},
  publisher = {Springer-Verlag},
  year      = {1989},
  address   = {Wien},
}

@article{gardner75,
  author  = {Martin Gardner},
  title   = {Mathematical Games},
  journal = {Scientific American},
  year    = {1975},
  volume  = {232},
  number  = {4},
  pages   = {126--133},
  month   = apr,
}

@article{kubale&jackowski85,
  author  = {Marek Kubale and Bogus{\l}aw Jackowski},
  title   = {A General Implicit Enumeration Algorithm for Graph Coloring},
  journal = cacm,
  year    = {1985},
  volume  = {28},
  number  = {4},
  pages   = {412--418},
  month   = apr,
}

@phdthesis{koza89,
  author = {Christian Koza},
  title  = {{Garantiertes Zeitverhalten in verteilten Echtzeitsystemen}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1989},
}

@inproceedings{krall&neumerkel90,
  author    = {Andreas Krall and Ulrich Neumerkel},
  title     = {The {Vienna Abstract Machine}},
  booktitle = {Programming Language Implementation and Logic Programming
               (PLILP'90)},
  year      = {1990},
  OPTeditor = {P. Deransart and J. Ma{\l}uzy\'nski},
  pages     = {121--136},
  publisher = {Springer LNCS~456},
}

@inproceedings{neumerkel90,
  author    = {Ulrich Neumerkel},
  title     = {Extensible Unification by Metastructures},
  booktitle = {Meta-90},
  year      = {1990},
  address   = {Leuven},
}

@techreport{warren83,
  author      = {David H. D. Warren},
  title       = {An Abstract {Prolog} Instruction Set},
  institution = {SRI International},
  year        = {1983},
  number      = {309},
}

@inproceedings{lim&stuckey90,
  author    = {Pierre Lim and Peter J. Stuckey},
  title     = {A Constraint Logic Programming Shell},
  booktitle = {Programming Language Implementation and Logic Programming
               (PLILP'90)},
  year      = {1990},
  OPTeditor = {P. Deransart and J. Ma{\l}uzy\'nski},
  pages     = {75--88},
  publisher = {Springer LNCS~456},
}

@inproceedings{naish86b,
  author    = {Lee Naish},
  title     = {Negation and Quantifiers in {NU-Prolog}},
  booktitle = iclp3,
  year      = {1986},
  pages     = {625--634},
  address   = {London},
  publisher = {Springer LNCS 225},
}

@article{dewdney86a,
  author  = {A. K. Dewdney},
  title   = {{Computer-Kurzweil}},
  journal = {Spektrum der Wissenschaft},
  year    = {1986},
  pages   = {5--11},
  month   = feb,
}

@article{dewdney86b,
  author  = {A. K. Dewdney},
  title   = {{Computer-Kurzweil}},
  journal = {Spektrum der Wissenschaft},
  year    = {1986},
  pages   = {6--10},
  month   = may,
}

@techreport{graf87,
  author      = {Thomas Graf},
  title       = {Extending Constraint Handling in Logic Programming to Rational
                 Arithmetic},
  institution = {ECRC},
  year        = {1987},
  type        = {Internal Report},
}

@phdthesis{graf89,
  author = {Thomas Graf},
  title  = {Raisonnement sur les contraintes en programmation en logique},
  school = {Universit\'{e} de Nice -- Sophia Antipolis},
  year   = {1989},
}

@article{buttner&simonis87,
  author  = {W. Buttner and H. Simonis},
  title   = {Embedding Boolean Expressions into Logic Programming},
  journal = {Journal of Symbolic Computation},
  year    = {1987},
  volume  = {4},
  pages   = {191--205},
  month   = oct,
}

@article{fisher81,
  author  = {Joseph A. Fisher},
  title   = {Trace Scheduling: A Technique for Global Microcode Compaction},
  journal = ieeetc,
  year    = {1981},
  volume  = {30},
  number  = {7},
  pages   = {478--490},
  month   = jul,
  annote  = {Trace Scheduling takes one (often used) path of the program
             and schedules (compacts) without respect to basic
             block boundaries. Correctness is ensured by inserting
             compensation code in adjacent basic blocks. The
             process is repeated until all basic blocks have been
             scheduled.},
}

@article{hu61,
  author  = {T. C. Hu},
  title   = {Parallel Sequencing and Assembly Line Problems},
  journal = {Operations Research},
  year    = {1961},
  volume  = {9},
  number  = {6},
  pages   = {841--848},
}

@book{kastens90,
  author    = {Uwe Kastens},
  title     = {{\"U}bersetzerbau},
  publisher = {R. Oldenbourg Verlag},
  year      = {1990},
  address   = {M{\"u}nchen},
}

% Journal article in volume 8 of the Journal of Logic Programming.  The
% volume is recorded in the volume field (it was previously stored as the
% issue number); the paper appeared in the double issue 1--2.
@article{dincbas+90,
  author  = {Mehmet Dincbas and Helmut Simonis and Van
             Hentenryck, Pascal},
  title   = {Solving Large Combinatorial Problems in Logic Programming},
  journal = {The Journal of Logic Programming},
  year    = {1990},
  volume  = {8},
  number  = {1--2},
  pages   = {75--93},
}

@article{hennessy&gross83,
  author  = {John Hennessy and Thomas Gross},
  title   = {Postpass Code Optimization of Pipeline Constraints},
  journal = toplas,
  year    = {1983},
  volume  = {5},
  number  = {3},
  pages   = {422--448},
  month   = jul,
  annote  = {Discusses instruction scheduling, shows its NP-completeness
             and presents an heuristic algorithm for instruction
             scheduling. The algorithm removes 47\% of the NOPs in
             their example programs (optimum = 54\%)},
}

@inproceedings{davidson86,
  author    = {Jack W. Davidson},
  title     = {A Retargetable Instruction Reorganizer},
  booktitle = {SIGPLAN '86 Symposium on Compiler
               Construction},
  year      = {1986},
  pages     = {234--241},
  annote    = {Enhances PO (and Davidson-Fraser code generators like the GNU
               C backend in general) to include evaluation order
               optimization and targeting. The enhancement can be
               used for instruction scheduling, too.},
}

% IEEE Transactions on Computers article.  The page range is 967--979; the
% earlier value 318--328 belongs to a different paper (Lam's SIGPLAN '88
% software pipelining paper).  Third author's surname is O'Donnell.
@article{colwell+88,
  author  = {Robert P. Colwell and Robert P. Nix and John J. O'Donnell and
             David B. Papworth and Paul K. Rodman},
  title   = {A {VLIW} Architecture for a Trace Scheduling Compiler},
  journal = ieeetc,
  year    = {1988},
  volume  = {37},
  number  = {8},
  pages   = {967--979},
  month   = aug,
}

% Proceedings details are taken from the iclp87 entry through crossref.
@inproceedings{vanhentenryck&dincbas87,
  author   = {Van Hentenryck, Pascal and Mehmet Dincbas},
  title    = {Forward Checking in Logic Programming},
  crossref = {iclp87},
  pages    = {229--256},
}

% Proceedings record named by crossref fields; booktitle repeats title.
@proceedings{iclp87,
  key       = {ICLP-4},
  title     = {Fourth International Conference on Logic
               Programming (ICLP-4)},
  booktitle = {Fourth International Conference on Logic
               Programming (ICLP-4)},
  year      = {1987},
  publisher = {MIT Press},
}

@manual{motorola90,
  title        = {MC88100 RISC Microprocessor User's Manual},
  organization = {Motorola, Inc.},
  edition      = {second},
  year         = {1990},
}

@article{hatcher91,
  author  = {Philip J. Hatcher},
  title   = {The Equational Specification of Efficient Compiler
             Code Generation},
  journal = complang,
  year    = {1991},
  volume  = {16},
  number  = {1},
  pages   = {81--95},
  annote  = {A system (UCG) for the equational specification of code
             generators is presented. The code generators work
             by rewriting (explicit) trees with associated
             actions, similar to Graham-Glanville code generators.
             The tree rewriting system enables pre-code generation
             transformations, too. The code generators are about
             twice as fast as those produced by the
             Davidson-Fraser and Graham-Glanville approaches and the
             code generator of pcc.},
}

% Author surname spelt Dhamdhere, matching the citation key and the author's
% 1990 paper in the same journal.
@article{dhamdhere88,
  author  = {Dhananjay Madhav Dhamdhere},
  title   = {Register Assignment Using Code Placement Techniques},
  journal = complang,
  year    = {1988},
  volume  = {13},
  number  = {2},
  pages   = {75--93},
  annote  = {Describes an algorithm that moves Load/Stores to good
             places. The resulting program is guaranteed not to be
             slower than the original program. Cites many papers
             that are not found in other register allocation
             literature.},
}

@inproceedings{wall86,
  author          = {David W. Wall},
  title           = {Global Register Allocation at Link Time},
  booktitle       = {SIGPLAN '86 Symposium on Compiler Construction},
  year            = {1986},
  pages           = {264--275},
  OPTorganization = {ACM SIGPLAN},
  annote          = {Uses annotations from the compiler to do fast
                     interprocedural register allocation at link time.
                     Speedups of 10--20\% are obtained. Most (52--99\%) of
                     the removable memory references are removed. The
                     improvement over intraprocedural coloring allocation
                     is 1--8\%.},
}

@article{tjaden&flynn70,
  author  = {Garold S. Tjaden and Michael J. Flynn},
  title   = {Detection and Parallel Execution of Independent Instructions},
  journal = ieeetc,
  year    = {1970},
  volume  = {19},
  number  = {10},
  pages   = {889--895},
  month   = oct,
  annote  = {A hardware method to achieve parallelism on a SISD
             machine. Simultaneously decodes multiple instructions
             and executes them with multiple execution units, but
             keeps dependent instructions in the correct order.
             Simulations show that 1.86 IBM~7090 instructions can be
             executed per cycle without compiler assistance.},
}

@inproceedings{gibbons&muchnick86,
  author          = {Phillip B. Gibbons and Steve S. Muchnick},
  title           = {Efficient Instruction Scheduling for a Pipelined Architecture},
  booktitle       = {SIGPLAN '86 Symposium on Compiler Construction},
  year            = {1986},
  pages           = {11--16},
  OPTorganization = {ACM SIGPLAN},
  annote          = {A heuristic algorithm for instruction scheduling with
                     $O(n^2)$ worst-case and linear observed complexity.},
}

@article{zima86,
  author  = {Hans P. Zima},
  title   = {A Constraint Language and its Interpreter},
  journal = complang,
  year    = {1986},
  volume  = {11},
  number  = {2},
  pages   = {65--83},
  annote  = {The language looks pascaloid, supports built-in and
             user-defined constraints on integer and real
             variables and contains structuring constructs like
             arrays and iterators. The interpreter analyses the
             program and tries to find bindings for all variables.
             If necessary, the user is requested to input values
             for selected variables.},
}

@article{chow&hennessy90,
  author  = {Fred C. Chow and John L. Hennessy},
  title   = {The Priority-Based Coloring Approach to Register Allocation},
  journal = toplas,
  year    = {1990},
  volume  = {12},
  number  = {4},
  pages   = {501--536},
  month   = oct,
  annote  = {Assigns registers using a priority function. If no
             registers are available, live ranges are split and
             move or spill code is inserted. Includes a lot of data.},
}

@article{davidson&fraser84,
  author  = {Jack W. Davidson and Christopher W. Fraser},
  title   = {Code Selection through Object Code Optimization},
  journal = toplas,
  year    = {1984},
  volume  = {6},
  number  = {4},
  pages   = {505--526},
  month   = oct,
  annote  = {The code generator emits naive, but executable code
             as register transfers (a kind of universal assembly),
             which is transformed into efficient code by a
             peephole optimizer.},
}

@inproceedings{lam88,
  author          = {Monica Lam},
  title           = {Software Pipelining: An Effective Scheduling
                     Technique for {VLIW} Machines},
  booktitle       = {SIGPLAN '88 Conference on
                     Programming Language Design and Implementation},
  year            = {1988},
  pages           = {318--328},
  OPTorganization = {ACM SIGPLAN},
  annote          = {Software Pipelining enables the parallel execution of
                     multiple loop iterations by translating loops in a
                     prolog, a steady state and an epilog, in which the
                     pipeline is filled, stays full and is emptied,
                     respectively. The paper also presents hierarchical
                     reduction, which enables the application of software
                     pipelining to loops containing conditional statements.},
}

@inproceedings{fraser&wendt88,
  author          = {Christopher W. Fraser and Alan L. Wendt},
  title           = {Automatic Generation of Fast Optimizing Code Generators},
  booktitle       = {SIGPLAN '88 Conference on
                     Programming Language Design and Implementation},
  year            = {1988},
  pages           = {79--84},
  OPTorganization = {ACM SIGPLAN},
  annote          = {Nonprocedural specifications for code generator and
                     peephole optimizer are used to compile a testbed of
                     programs. The record of the optimizations during
                     these compilations is used to generate a fast, hard-coded
                     integrated code generator and peephole optimizer.},
}

% SIGPLAN '88 paper; annotation spelling corrected ("separate").
% OPTorganization is not a standard field name, so the standard styles do
% not print it.
@inproceedings{chow88,
  author          = {Fred C. Chow},
  title           = {Minimizing Register Usage Penalty at Procedure Calls},
  booktitle       = {SIGPLAN '88 Conference on
                     Programming Language Design and Implementation},
  year            = {1988},
  pages           = {85--94},
  OPTorganization = {ACM SIGPLAN},
  annote          = {Describes inter-procedural register allocation at
                     compile time based on processing the procedures in a
                     one-pass depth-first traversal. Resorts to the
                     conventional scheme when there is insufficient
                     information (e. g. separate compilation). Also
                     describes shrink-wrapping of saves and restores
                     to regions of activity to avoid having saves and
                     restores in paths where they are not needed.
                     Interprocedural register allocation has speedups of
                     -2.6--12\%, shrink-wrapping -0.2--2\%, with respect
                     to optimized code with intraprocedural register
                     allocation.},
}

@inproceedings{benitez&davidson88,
  author    = {Manuel E. Benitez and Jack W. Davidson},
  title     = {A Portable Global Optimizer and Linker},
  booktitle = {SIGPLAN '88 Conference on
               Programming Language Design and Implementation},
  year      = {1988},
  pages     = {329--338},
  annote    = {A nice treatment of the Davidson-Fraser approach
               combined with global optimizations. The only thing
               new seems to be the linker, which does some simple
               interprocedural optimizations (call/entry
               streamlining). They buy a speedup of 3--20\% on a
               VAX and up to 5\% on a SUN~3.},
}

@inproceedings{larus&hilfinger86,
  author    = {James R. Larus and Paul N. Hilfinger},
  title     = {Register Allocation in the SPUR Lisp Compiler},
  booktitle = {SIGPLAN '86 Symposium on Compiler Construction},
  year      = {1986},
  pages     = {255--263},
  annote    = {Chow's priority-based coloring in the context of
               register windows.},
}

@inproceedings{fraser&wendt86,
  author    = {Christopher W. Fraser and Alan L. Wendt},
  title     = {Integrating Code Generation and Optimization},
  booktitle = {SIGPLAN '86 Symposium on Compiler Construction},
  year      = {1986},
  pages     = {242--248},
  annote    = {An improvement on PO: At compile-compile time PO runs
               on a testbed and generates a fixed set of rules that
               are used at compile-time by a fast, rule-directed
               optimizer, that avoids much string scanning.},
}

% Proceedings details are taken from the sigplan89 entry through crossref.
% Annotation: possessive written as "Chaitin's".
@inproceedings{bernstein+89,
  author   = {David Bernstein and Dina Q. Goldin and Martin C.
              Golumbic and Yishay Mansour and Itai Nahshon and Ron
              Y. Pinter},
  title    = {Spill Code Minimization Techniques for Optimizing Compilers},
  crossref = {sigplan89},
  pages    = {258--263},
  annote   = {Improves Chaitin's algorithm by using three heuristics
              for using the next variable to spill and choosing the
              best one with respect to a cost function.},
}

% Proceedings details are taken from the sigplan89 entry through crossref.
% Annotation spelling matches the title ("separators").
@inproceedings{gupta+89,
  author   = {Rajiv Gupta and Mary Lou Soffa and Tim Steele},
  title    = {Register Allocation Via Clique Separators},
  crossref = {sigplan89},
  pages    = {264--274},
  annote   = {Graphs are easier to color by decomposing them using
              clique separators (cliques that are the only
              connection of the parts) and coloring the parts. For
              straight line code the live ranges at any point of
              time are clique separators. Branches are handled like
              in trace scheduling. The technique is claimed as being
              more efficient at coloring time as well as producing
              better allocations.},
}

% Proceedings details are taken from the sigplan89 entry through crossref.
% Annotation spelling corrected ("deferring", "Chaitin's").
@inproceedings{briggs+89,
  author   = {Preston Briggs and Keith D. Cooper and Ken Kennedy
              and Linda Torczon},
  title    = {Coloring Heuristics for Register Allocation},
  crossref = {sigplan89},
  pages    = {275--284},
  annote   = {Improves Chaitin's Allocator by using a better coloring
              algorithm and by deferring the spill decision. The
              dynamic improvement over Chaitin's algorithm is 1\%
              for large floating-point programs. It increases as
              the number of registers decreases (15\% dynamic
              improvement for integer code with 8 registers). The
              allocation time stays the same.},
}

% Proceedings details are taken from the sigplan89 entry through crossref.
@inproceedings{emmelmann+89,
  author   = {Helmut Emmelmann and Friedrich-Wilhelm Schr\"oer and
              Rudolf Landwehr},
  title    = {{BEG} -- a Generator for Efficient Back Ends},
  crossref = {sigplan89},
  pages    = {227--237},
}

% Proceedings record named by crossref fields; booktitle repeats title.
@proceedings{sigplan89,
  key       = {SIGPLAN~'89},
  booktitle = {SIGPLAN~'89 Conference on
               Programming Language Design and Implementation},
  title     = {SIGPLAN~'89 Conference on
               Programming Language Design and Implementation},
  year      = {1989},
}

% SIGPLAN '86 paper; annotation spelling corrected ("Precision").
@inproceedings{johnson&miller86,
  author    = {Mark Scott Johnson and Terence C. Miller},
  title     = {Effectiveness of a Machine-Level, Global Optimizer},
  booktitle = {SIGPLAN '86 Symposium on Compiler Construction},
  year      = {1986},
  pages     = {99--107},
  annote    = {Gives an overview of the back end of the HP Precision
               Architecture compilers.},
}

@article{ganapathi89,
  author  = {Mahadevan Ganapathi},
  title   = {Prolog Based Retargetable Code Generation},
  journal = complang,
  year    = {1989},
  volume  = {14},
  number  = {3},
  pages   = {193--204},
  annote  = {Code Generation is quite conventional (tree
             rewriting). Also discusses some optimizations, but I
             did not get him.},
}

@article{dhamdhere90,
  author  = {Dhananjay Madhav Dhamdhere},
  title   = {A Usually Linear Algorithm for Register Assignment
             Using Edge Placement of Load and Store Instructions},
  journal = complang,
  year    = {1990},
  volume  = {15},
  number  = {2},
  pages   = {83--94},
  annote  = {Improves on his earlier Load/Store-movement algorithm
             by placing the loads and stores on edges of the flow
             graph. It produces better code than the older
             algorithm and runs faster.},
}

@article{golumbic&rainish90,
  author  = {Martin Charles Golumbic and Vladimir Rainish},
  title   = {Instruction Scheduling Beyond Basic Blocks},
  journal = ibmjrd,
  year    = {1990},
  volume  = {34},
  number  = {1},
  pages   = {93--97},
  month   = jan,
  annote  = {Discusses some rearrangements that reduce pipeline
             stalls around branches for the RS/6000. Very
             processor-specific and not very ingenious.},
}

@article{warren90,
  author  = {Warren, Jr., Henry S.},
  title   = {Instruction Scheduling for the {IBM RISC System/6000} processor},
  journal = ibmjrd,
  year    = {1990},
  volume  = {34},
  number  = {1},
  pages   = {85--92},
  month   = jan,
  annote  = {Gives a description of the delays that can happen in
             the RS/6000. It does the usual scheduling algorithm.
             One of the secondary heuristics (liveness weight)
             should reduce register pressure. The scheduling is
             done before and after register allocation.},
}

% Proceedings details are taken from the asplos91 entry through crossref.
% Annotation: the authors of Integrated Prepass Scheduling are written as
% "Goodman and Hsu".
@inproceedings{bradlee+91asplos,
  author   = {David G. Bradlee and Susan J. Eggers and Robert R. Henry},
  title    = {Integrating Register Allocation and Instruction
              Scheduling for {RISCs}},
  crossref = {asplos91},
  pages    = {122--131},
  annote   = {Compares three strategies: Postpass; a variant of
              Goodman and Hsu's Integrated Prepass Scheduling (IPS),
              where instruction scheduling is performed before
              register allocation, but considers register
              allocation; and their own RASE, where the register
              allocator considers the effects of its choices on the
              instruction scheduler. Floating-point and integer
              programs are measured on the 88100, R2000 and i860.
              IPS and RASE are much better than postpass; RASE is
              better than IPS in a few cases, but is not cost-effective.},
}

@InProceedings{lee+91,
  author = 	"Roland L. Lee and Alex Y. Kwok and Fay\'e A. Briggs",
  title = 	"The Floating-Point Performance of a Superscalar
		 {SPARC} Processor",
  crossref = 	"asplos91",
  pages = 	"28--37",
  annote = 	"Compares loop unrolling, software pipelining and
		 the combination for floating point programs on SPARC
		 processors with 1, 2, or 4 instructions/cycle."
}

@inproceedings{goodman&hsu88,
  author = {James R. Goodman and Wei-Chung Hsu},
  title = {Code Scheduling and Register Allocation in Large
    Basic Blocks},
  booktitle = {International Conference on Supercomputing},
  year = {1988},
  pages = {442--452},
  annote = {After an overview of the phase ordering problems in
    instruction scheduling and register allocation two
    algorithms are introduced: Integrated Prepass
    Scheduling keeps track of the number of registers
    left and switches between scheduling to minimize
    pipeline stalls and scheduling to minimize register
    usage accordingly. A variation of this algorithm also
    spills registers in certain circumstances. DAG-Driven
    Register Allocation is to be used with a postpass
    scheduler and tries to allocate the registers without
    generating new dependencies. If this cannot be
    achieved, the register is chosen in a way that
    minimizes the path length of the additional paths.
    The two algorithms (combined with a register
    allocator and an instruction scheduler, respectively)
    perform better than the usual prepass, postpass
    or two-pass approaches.},
}

@techreport{waltz72,
  author = {D. Waltz},
  title = {Generating Semantic Descriptions from Drawings of
    Scenes with Shadows},
  institution = {MIT},
  year = {1972},
  number = {AI271},
}

@book{koopman89,
  author = {Koopman, Jr., Philip J.},
  title = {Stack Computers},
  publisher = {Ellis Horwood Limited},
  year = {1989},
  url = {http://www.cs.cmu.edu/~koopman/stack_computers/index.html},
  annote = {Presents the architecture of recent stack processors
    coming from the Forth community. Compares stack
    machines with conventional machines. The results:
    For real-time applications stack machines are better,
    because their worst-case performance is higher.
    Contrary to common opinion stack machines have faster
    context-switches than conventional machines.},
}

@inproceedings{proebsting&fischer91,
  author = {Todd A. Proebsting and Charles N. Fischer},
  title = {Linear-time, Optimal Code Scheduling for Delayed-Load
    Architectures},
  crossref = {sigplan91},
  pages = {256--267},
  annote = {This algorithm generates optimal schedules with
    optimal register usage for a very limited problem:
    binary expression trees on a load/store architecture
    with a one cycle delay on loads, where the leaf nodes
    are loads, i.e. no register variables, no constants,
    no unary operators, no common subexpression
    elimination, and scheduling is restricted to one
    statement at a time. It generates optimal register
    spill code. Also, the (sub-optimal) application of the
    algorithm to DAGs and longer delays is treated.},
}

@inproceedings{jain91,
  author = {Suneel Jain},
  title = {{Circular Scheduling}: A new Technique to Perform
    Software Pipelining},
  crossref = {sigplan91},
  pages = {219--228},
  annote = {Performs software pipelining by selecting an
    instruction without predecessors from the dependency
    DAG of the loop body, moving it to the end of the
    loop and rescheduling. This step is repeated until
    the schedule becomes a lot worse. Then the best
    schedule is used. For compensation the unmoved
    instructions are used as prolog and the moved ones as
    epilog. A register renaming algorithm to reduce
    dependencies introduced by register allocation is
    introduced, too. The improvement on the Livermore
    loops is between -5\% and 53\% on the MIPS R6010.},
}

@article{bernhardsson91,
  author = {Bo Bernhardsson},
  title = {Explicit Solutions to the N-Queens Problem for all N},
  journal = sigart,
  year = {1991},
  volume = {2},
  number = {2},
  pages = {7},
}

@article{sosic&gu91,
  author = {Rok Sosic and Jun Gu},
  title = {3,000,000 Queens in Less Than One Minute},
  journal = sigart,
  year = {1991},
  volume = {2},
  number = {2},
  pages = {22--24},
  annote = {This algorithm sets almost all queens in a
    conflict-free manner, but the last few queens are
    allowed to have diagonal conflicts. These conflicts
    are then eliminated by selecting two queens (one
    systematically and one randomly) and swapping their
    columns if this improves the state of affairs. The
    algorithm runs in linear time.},
}

@article{gu91,
  author = {Jun Gu},
  title = {On a General Framework for Large-Scale
    Constraint-Based Optimization},
  journal = sigart,
  year = {1991},
  volume = {2},
  number = {2},
  pages = {8},
  annote = {An answer to \cite{bernhardsson91}},
}

@inproceedings{morris91,
  author = {W. G. Morris},
  title = {{CCG}: A Prototype Coagulating Code Generator},
  crossref = {sigplan91},
  pages = {45--58},
  annote = {Basic blocks are treated (code generation, register
    allocation,\ldots) in decreasing order of (measured
    or estimated) execution frequency. Later blocks have
    to adapt to decisions made in earlier blocks.
    Therefore the most frequent blocks can be compiled
    very well. Procedure calls are treated like other
    control flow constructs, resulting in automatic
    interprocedural optimization. The average speedup on
    gcc is 25\% on a Sun~3, but I had the impression that
    they used the same benchmarks for development and the
    measurements. The speedups are mainly from smaller
    procedure call overhead, so they are twice as
    valuable. There was apparently no separate
    compilation.},
}

@inproceedings{wall91pldi,
  author = {David W. Wall},
  title = {Predicting Program Behavior Using Real or Estimated Profiles},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {59--70},
  address = {Toronto},
  journal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
  annote = {Checks estimated profiles and real profiles measured on
    different input data for their accuracy and
    relevance. Real profiles prove best, estimated
    profiles are better than random profiles in most
    cases. The best estimated profiles are based on loop
    nesting level combined with static call counts.},
}

@inproceedings{mcfarling91,
  author = {Scott McFarling},
  title = {Procedure Merging with Instruction Caches},
  crossref = {sigplan91},
  pages = {71--79},
  annote = {Inlining is steered by a heuristic that considers
    the I-cache size; in particular, it tries to fit
    loops into the cache. The empirical part compares
    this new heuristic with heuristics based on size
    and heuristics based on the ratio of size and
    dynamic execution counts.},
}

@inproceedings{jaffar+91,
  author = {Joxan Jaffar and Spiro Michaylov and Roland H. C. Yap},
  title = {A Methodology for Managing Hard Constraints in {CLP} Systems},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {306--316},
  address = {Toronto},
  journal = sigplan,
  volume = {26},
  number = {6},
  month = jun,
  annote = {Describes the delay mechanism of CLP($\cal R$).
    Delayed constraints are pushed on a special stack;
    every time one of their responsible variables is
    changed, their new state is pushed. Responsible
    variables are connected to the constraints through a
    complicated access structure. Backtracking is done by
    popping the constraints and rebuilding the access
    structures (no trailing and untrailing).},
}

@inproceedings{pugh91,
  author = {William Pugh},
  title = {Advice to Authors of Extended Abstracts},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {353--356},
  OPTaddress = {Toronto},
  journal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
  annote = {How to get your paper accepted.},
}

@inproceedings{bradlee+91pldi,
  author = {David G. Bradlee and Robert R. Henry and Susan J. Eggers},
  title = {The {Marion} System for Retargetable Instruction Scheduling},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {229--240},
  address = {Toronto},
  OPTjournal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
  annote = {A back end generator for RISCs, consisting of simple
    instruction selection and several strategies for the
    combination of register allocation and instruction
    scheduling (Postpass, IPS, RASE). The machine
    description describes the resources needed by an
    instruction and contains means to describe
    explicitly advanced pipelines (i860), too. Machine
    descriptions were developed for the 88100, the R2000
    and the i860. The quality of the generated code is
    between the MIPS compiler's -O1 and -O2 levels.},
}

Crossref parent for the entries that use crossref = sigplan91.
Classic BibTeX requires a parent to come after every entry that references it.
@proceedings{sigplan91,
  key = {SIGPLAN~'91},
  booktitle = {SIGPLAN~'91 Conference on
    Programming Language Design and Implementation},
  title = {SIGPLAN~'91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  OPTaddress = {Toronto},
  OPTjournal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
}

@inproceedings{chang+91,
  author = {Pohua P. Chang and Scott A. Mahlke and William Y.
    Chen and Nancy J. Warter and {Wen-mei} W. Hwu},
  title = {{IMPACT}: An Architectural Framework for
    Multiple-Instruction-Issue Processors},
  crossref = {isca91},
  pages = {266--275},
  annote = {Describes an optimizing compiler that uses trace
    scheduling to increase instruction level parallelism.
    They call their approach general percolation. Empirical
    results are presented: Their software approach does
    nearly as well as speculative execution, a hardware
    method, if trap-generation by division by zero or
    illegal memory references is disabled. If such
    instructions can produce traps, they can not be moved
    as much (restricted percolation), resulting in
    noticeably lower performance.},
}

@inproceedings{butler+91,
  author = {Michael Butler and Tse-Yu Yeh and Yale Patt and
    Mitch Alsup and Hunter Scales and Michael Shebanow},
  title = {Single Instruction Stream Parallelism Is Greater Than Two},
  crossref = {isca91},
  pages = {276--286},
  annote = {A study on how much instruction-level parallelism can
    be achieved on different hardware models. Code
    produced from a conventional 88k-compiler is used,
    but better hardware featuring dynamic scheduling and
    speculative execution is simulated. If data
    dependencies are the only source of serialization,
    the SPEC suite has a parallelism of 17--1165. On
    a realistic machine having eight functional units
    integer parallelism is 2.4--3.4, floating point
    1.9--5.8 without considering cache misses.
    The paper identifies the bottlenecks in the simulated
    architectures.},
}

@inproceedings{melvin&patt91,
  author = {Stephen Melvin and Yale Patt},
  title = {Exploiting Fine-Grain Parallelism Through a
    Combination of Hardware and Software Techniques},
  crossref = {isca91},
  pages = {287--296},
  annote = {The parallelism available through dynamic scheduling
    and speculative execution can be enhanced with basic
    block enlargement, a technique similar to trace
    scheduling combined with loop unrolling.},
}

@inproceedings{quammen&miller91,
  author = {Donna J. Quammen and P. Richard Miller},
  title = {Flexible Register Management for Sequential Programs},
  crossref = {isca91},
  pages = {320--329},
  annote = {Describes a more flexible register window mechanism
    (threaded or t-windows), where windows can be
    allocated freely. This can be used to simulate a single
    register set, usual register windows, and register
    stack. The main advantage of this scheme is in
    concurrent systems, where you can use a window stack
    for every process.},
}

@inproceedings{bradlee+91isca,
  author = {David G. Bradlee and Susan J. Eggers and Robert R. Henry},
  title = {The Effect on {RISC} Performance of Register Set Size
    and Structure Versus Code Generation Strategy},
  crossref = {isca91},
  pages = {330--339},
  annote = {Machines with a single register set and slow floating
    point units (like 88100) are compared to machines
    with separate register sets and a fast FP unit (e.g.
    R3000). Machines with fast FP should use the FP unit
    for integer multiply even if they have a split
    register set.},
}

Crossref parent for the entries that use crossref = isca91.
Classic BibTeX requires a parent to come after every entry that references it.
@proceedings{isca91,
  key = {ISCA-18},
  booktitle = {The $18^{th}$ Annual International Symposium on
    Computer Architecture (ISCA)},
  title = {The $18^{th}$ Annual International Symposium on
    Computer Architecture (ISCA)},
  year = {1991},
  address = {Toronto},
  journal = can,
  OPTvolume = {19},
  OPTnumber = {3},
  OPTmonth = may,
}

@book{hennessy&patterson90,
  author = {John L. Hennessy and David A. Patterson},
  title = {Computer Architecture. A Quantitative Approach},
  publisher = {Morgan Kaufmann Publishers},
  year = {1990},
}

@inproceedings{callahan&koblenz91,
  author = {David Callahan and Brian Koblenz},
  title = {Register Allocation via Hierarchical Graph Coloring},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {192--203},
  address = {Toronto},
  journal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
  annote = {The register allocator builds a tile tree
    representing the control hierarchy (loops, ifs, etc.)
    of the procedure. Then the tiles are colored
    bottom-up, using Chaitin's algorithm for every tile.
    At last, spill code is inserted on rarely used tile
    boundaries. It sounds very good, but I miss an
    empirical comparison with other approaches.},
}

@inproceedings{bernstein&rodeh91,
  author = {David Bernstein and Michael Rodeh},
  title = {Global Instruction Scheduling for Superscalar Machines},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {241--255},
  address = {Toronto},
  journal = sigplan,
  OPTvolume = {26},
  OPTnumber = {6},
  OPTmonth = jun,
  annote = {Describes instruction scheduler that moves
    instructions between basic blocks. It can even move
    instructions to places where they are not always
    useful (speculative execution at compile-time). Using
    this scheduler increases compile-time by 12\%--17\%
    and decreases run-time by 0\%--7\% when compared with
    the usual RS/6000 compiler.},
}

@inproceedings{freeman&pfening91,
  author = {Tim Freeman and Frank Pfenning},
  title = {Refinement Types for {ML}},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {268--277},
  address = {Toronto},
  journal = sigplan,
  volume = {26},
  number = {6},
  month = jun,
  annote = {Adds (explicitly defined) subtypes to ML's type
    system, which can help the static type checking.},
}

@inproceedings{cartwright&fagan91,
  author = {Robert Cartwright and Mike Fagan},
  title = {Soft Typing},
  booktitle = {SIGPLAN '91 Conference on
    Programming Language Design and Implementation},
  year = {1991},
  pages = {278--292},
  address = {Toronto},
  journal = sigplan,
  volume = {26},
  number = {6},
  month = jun,
  annote = {Expands ML's type system to gain expressive power.
    Not all programs using the extended type system can
    be type-checked at compile time, so run-time checks
    are inserted. The goals in designing the type-system
    were to have implicit typing and to be able to check
    the majority of the programs at compile-time, so the
    user takes warnings about inserted run-time checks
    seriously.},
}

@article{grabienski91,
  author = {Peter Grabienski},
  title = {A Stack-Oriented Multiprocessing System},
  journal = can,
  year = {1991},
  volume = {19},
  number = {1},
  pages = {120--127},
  month = mar,
  annote = {Combines Forth processor with links similar to a
    transputer. Unlike transputers, the links are
    byte-wide (10 MB/s) and use hardware routing.},
}

@inproceedings{cytron+89,
  author = {Ron Cytron and Jeanne Ferrante and Barry K. Rosen and
    Mark N. Wegman and F. Kenneth Zadeck},
  title = {An Efficient Method of Computing Static Single
    Assignment Form},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {25--35},
  address = {Austin, Texas},
  annote = {Static Single Assignment (SSA) form is equivalent to a data
    flow graph in basic blocks and $\phi$-Functions for
    merging of data flow edges, when control flow merges.
    This form is useful for optimization (they give many
    references). In this article an algorithm for
    computing the SSA form is presented. It is shown that
    the algorithm is linear with the size of the program
    on programs using if- and while-structures. Empirical
    data on irreducible programs suggests that this is
    usually also true for these programs.},
}

@inproceedings{sagiv+89,
  author = {S. Sagiv and O. Edelstein and N. Francez and M. Rodeh},
  title = {Resolving Circularity in Attribute Grammars with
    Applications to Data Flow Analysis},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {36--48},
  address = {Austin, Texas},
  annote = {How to transform circular attributed grammars, where
    a unique fixed point can be computed, into
    noncircular AGs. Describe the applications of such
    AGs to data flow analysis.},
}

@inproceedings{wadler&blott89,
  author = {Philip Wadler and Stephen Blott},
  title = {How To Make {\em ad-hoc} Polymorphism Less {\em ad hoc}},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {60--76},
  address = {Austin, Texas},
  annote = {Describe type classes, a generalization of ML's
    eqtype variables. A type class is a set of types,
    which have some user-defined operations. E.g. Num =
    (+), (*), Negate; Instances: Int, Float. Extensible
    to arbitrary types, the operations must be defined
    for every type. For convenience there are also
    subclasses. The translation into an SML-like language
    is described (it works with a method dictionary).
    Type classes result in a lot of writing.},
}

@inproceedings{kanellakis&mitchell89,
  author = {Paris C. Kanellakis and John C. Mitchell},
  title = {Polymorphic unification and {ML} typing},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {105--115},
  address = {Austin, Texas},
  annote = {Proves that polymorphic unification, i.e. ML typing
    is PSPACE hard. The complexity comes from {\tt let}.
    However, practical programs with many {\tt let}s can
    be typed without problems.},
}

@inproceedings{yelick&zachary89,
  author = {Katherine A. Yelick and Joseph L. Zachary},
  title = {Moded Type Systems for Logic Programming},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {116--124},
  address = {Austin, Texas},
  annote = {Describes a mode system for equational logic
    programming languages, that narrows the gap between
    declarative and procedural semantics. It is shown
    that two predicate implementations with the same
    declarative meaning will be operationally equivalent.},
}

@inproceedings{hickey89,
  author = {Timothy J. Hickey},
  title = {{CLP*} and Constraint Abstraction},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {125--133},
  address = {Austin, Texas},
}

@inproceedings{cardelli+89,
  author = {Luca Cardelli and Jim Donahue and Mick Jordan and
    Bill Kalsow and Greg Nelson},
  title = {The {Modula-3} Type System},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {202--212},
  address = {Austin, Texas},
  annote = {The subtype relation is central to the system. The
    unusual parts are traced references (garbage
    collection), Exceptions as part of the procedure type
    and the global use of structural equivalence.
    Pointers and Arrays can be assigned to subtypes, the
    correctness of the operation is checked at run time.
    Objects are records, methods are procedure variables
    with self as first operand; the actual method
    (procedure) is determined at object creation time (new).},
}

@inproceedings{abadi+89,
  author = {Mart{\'\i}n Abadi and Luca Cardelli and Benjamin Pierce
    and Gordon Plotkin},
  title = {Dynamic Typing in a Statically-Typed Language},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {213--227},
  address = {Austin, Texas},
  annote = {Introduce a type {\tt Dynamic} and describe its
    semantics. The handling of the type is somewhat tiring.},
}

@inproceedings{parker89,
  author = {D. Stott Parker},
  title = {Partial Order Programming},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {260--266},
  address = {Austin, Texas},
  annote = {Describes a framework for several kinds of problems,
    among them constraint satisfaction problems, and some
    properties of the framework.},
}

@inproceedings{kelsey&hudak89,
  author = {Richard Kelsey and Paul Hudak},
  title = {Realistic Compilation by Program Transformation},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {281--292},
  address = {Austin, Texas},
  annote = {A back end, using lambda calculus with an implicit
    store as intermediate language. The phases are:
    making the program linear (no nested expressions),
    adding explicit continuations, simplifying, adding
    environments, identifier renaming/register
    allocation. Not completely denotational. The code
    produced is about as good as that of the Apollo
    Pascal compiler.},
}

@inproceedings{appel&jim89,
  author = {Andrew W. Appel and Trevor Jim},
  title = {Continuation-Passing, Closure-Passing style},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {293--302},
  address = {Austin, Texas},
  annote = {ML compiler based on continuation passing style.
    After the translation into CPS and optimization
    closures are made explicit and registers are
    allocated. No stack is used for the closures. Instead
    the compiler relies on garbage collection.},
}

@inproceedings{pugh&teitelbaum89,
  author = {William Pugh and Tim Teitelbaum},
  title = {Incremental Computation via Function Caching},
  booktitle = {Sixteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1989},
  pages = {315--328},
  address = {Austin, Texas},
}

@book{hopl81,
  title = {History of Programming Languages},
  publisher = {ACM Press},
  year = {1981},
  editor = {Richard L. Wexelblat},
}

@article{kuga+91,
  author = {Morihiro Kuga and Kazuaki Murakami and Shinji Tomita},
  title = {{DSNS} (dynamically-hazard-resolved,
    statically-code-scheduled, nonuniform superscalar).
    {Yet} Another Superscalar Processor Architecture},
  journal = can,
  year = {1991},
  volume = {19},
  number = {4},
  pages = {14--29},
  month = jun,
  annote = {Features several sorts of load instructions differing
    in the execution order they require (strongly, weakly
    and un-ordered). The specific load instructions are
    selected by the compiler using aliasing information.},
}

@article{ponder91,
  author = {Carl Ponder},
  title = {Performance Variation Across Benchmarks},
  journal = can,
  year = {1991},
  volume = {19},
  number = {4},
  pages = {30--36},
  month = jun,
  annote = {Compares how much various benchmarks differ when run
    on two machines. Livermore loops, Dhry- and Whetstone
    are better than their reputation, application
    benchmarks can be deceptive, too.},
}

@article{conte&hwu91,
  author = {Thomas M. Conte and {Wen-mei} Hwu},
  title = {A Brief Survey of Benchmark Usage in the Architecture
    Community},
  journal = can,
  year = {1991},
  volume = {19},
  number = {4},
  pages = {37--44},
  month = jun,
  annote = {Takes the papers of the ISCAs '84--'90, classifies
    the used Benchmarks, and analyses the usage patterns.},
}

@article{fraser&hanson91a,
  author = {Christopher W. Fraser and David R. Hanson},
  title = {A Code Generation Interface for {ANSI C}},
  journal = spe,
  year = {1991},
  volume = {21},
  number = {9},
  pages = {963--988},
  month = sep,
  annote = {Describes call-interface and intermediate code (DAGs)
    for a compiler that passes information between phases
    through memory. Very practice-oriented (listings
    etc.)},
}

@article{fraser&hanson91b,
  author = {Christopher W. Fraser and David R. Hanson},
  title = {A Retargetable Compiler for {ANSI C}},
  journal = sigplan,
  year = {1991},
  volume = {26},
  number = {10},
  pages = {29--43},
  month = oct,
  annote = {Describes a fast C compiler and how they made it fast.},
}

@article{landskov+80,
  author = {David Landskov and Scott Davidson and Bruce Shriver
    and Patrick W. Mallett},
  title = {Local Microcode Compaction Techniques},
  journal = acmcs,
  year = {1980},
  volume = {12},
  number = {3},
  pages = {261--294},
  month = sep,
}

@article{rodriguez90,
  author = {Brad Rodriguez},
  title = {A {BNF} Parser in {Forth}},
  journal = sigforth,
  year = {1990},
  volume = {2},
  number = {2},
  pages = {13--15},
  month = dec,
  url = {http://www.forth.org/bnfparse.html},
  annote = {Describes top-down parsing with backtracking in
    Forth; includes a listing.},
}

@inproceedings{kessler+91,
  author = {C. W. Ke{\ss}ler and W. J. Paul and T. Rauber},
  title = {A Randomized Heuristic Approach to Register Allocation},
  crossref = {plilp91},
  pages = {195--206},
}

Crossref parent for the entries that use crossref = plilp91.
Classic BibTeX requires a parent to come after every entry that references it.
@proceedings{plilp91,
  key = {PLILP'91},
  title = {Programming Language Implementation and Logic
    Programming (PLILP)},
  booktitle = {Programming Language Implementation and Logic
    Programming (PLILP)},
  year = {1991},
  OPTeditor = {Jan Ma{\l}uszy\'nski and Martin Wirsing},
  publisher = {Springer LNCS~528},
  address = {Passau},
}

@inproceedings{haberler&ertl89,
  author = {Michael Haberler and Martin Ertl},
  title = {Offloading A Mainframe or Teaching A Spreadsheet How
    To Access Big Databases},
  booktitle = {EUUG Autumn '89 Conference},
  year = {1989},
  pages = {45--49},
  address = {Wien},
}

@article{ertl91,
  author = {M. A. Ertl},
  title = {{Kurzfassung der Diplomarbeit ``Coroutining und
    Constraints in der Logik-Programmierung''}},
  journal = {{\"{O}GAI Journal}},
  year = {1991},
  volume = {9},
  number = {4},
  pages = {12--20},
  month = mar,
}

@techreport{fraser&hanson90,
  author = {Christopher W. Fraser and David R. Hanson},
  title = {A Code Generation Interface for {ANSI C}},
  institution = {AT\&T Bell Laboratories},
  year = {1990},
  type = {Research Report},
  number = {CS-TR-270-90},
  note = {Revision of September 1991},
  annote = {The interface of the {\tt lcc} front end to the back
    end consists of a few functions that are called and
    data structures for expression dags and symbols. The
    interface is explained by using a simple vax back end
    as an example.},
}

@techreport{rodriguez89,
  author = {Brad Rodriguez},
  title = {Moving {Forth}: Principles of Metacompilation},
  institution = {T-Recursive Technology},
  year = {1989},
  address = {55 McCaul St. \#14, Toronto, Ontario M5T 2W7 Canada},
  url = {http://www.zetetics.com/bj/papers/},
  annote = {Forth Metacompilation is the art of creating a new
    Forth System (possibly for a different machine) on
    the current one. This paper explains the concepts
    quite well.},
}

@inproceedings{pelegri-llopart&graham88,
  author = {Eduardo Pelegr{\'\i}-Llopart and Susan L. Graham},
  title = {Optimal Code Generation for Expression Trees: An
    Application of the {BURS} Theory},
  booktitle = {Fifteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1988},
  pages = {294--308},
  annote = {Describes bottom-up rewrite systems. The first half
    is very dry and theoretical (definitions and
    propositions). The second half gives empirical
    results for applying BURS to code selection and makes
    comparisons with other approaches.},
}

@article{aho+77,
  author = {A. V. Aho and S. C. Johnson and J. D. Ullman},
  title = {Code Generation for Expressions with Common
    Subexpressions},
  journal = jacm,
  year = {1977},
  volume = {24},
  number = {1},
  pages = {146--160},
  month = jan,
  annote = {Shows that optimal copy generation in dags for
    two-address machines is NP-complete. Contrary to
    popular belief, this paper does not show, that optimal
    instruction selection in dags is NP-complete in
    general. Also discussed are linear-time code selection
    algorithms and their deviations from optimality and
    optimal algorithms and their complexity.},
}

@inproceedings{chase87,
  author = {David R. Chase},
  title = {An Improvement To Bottom-up Tree Pattern Matching},
  booktitle = {Fourteenth Annual {ACM} Symposium on Principles of
    Programming Languages},
  year = {1987},
  pages = {168--177},
}

@manual{fraser+91,
  title = {{\sc Burg} --- Fast Optimal Instruction Selection and
    Tree Parsing},
  author = {Christopher W. Fraser and Robert R. Henry and Todd
    A. Proebsting},
  year = {1991},
  url = {ftp://kaese.cs.wisc.edu/pub/burg.shar.Z},
  annote = {A BURS tree pattern matcher generator.},
}

@article{fraser+92,
  author = {Christopher W. Fraser and Robert R. Henry and Todd
    A. Proebsting},
  title = {{\sc Burg} --- Fast Optimal Instruction Selection and
    Tree Parsing},
  journal = sigplan,
  year = {1992},
  volume = {27},
  number = {4},
  pages = {68--76},
  month = apr,
  url = {ftp://kaese.cs.wisc.edu/pub/burg.shar.Z},
}

@string{loplas = {ACM Letters on Programming Languages and Systems}}

@article{fraser+93,
  author = {Christopher W. Fraser and David R. Hanson and Todd
    A. Proebsting},
  title = {Engineering a simple, efficient code generator generator},
  journal = loplas,
  year = {1993},
  OPTvolume = {},
  OPTnumber = {},
  OPTpages = {},
  OPTmonth = {},
  url = {ftp://ftp.cs.princeton.edu/pub/iburg.tar.Z},
}

@manual{bradley87,
  title = {68000 Unix Forth-83},
  author = {Mitch Bradley},
  year = {1987},
  note = {Available via ftp, newer versions commercial},
}

@manual{patel90,
  title = {TILE Release 2.1},
  author = {Mikael Patel},
  year = {1990},
  note = {Available via ftp from any GNU archive site},
}

@inproceedings{wall91asplos,
  author = {David W. Wall},
  title = {Limits of Instruction-Level Parallelism},
  crossref = {asplos91},
  pages = {176--188},
  annote = {Compares a 64-scalar computer with perfect branch and
    jump prediction, alias analysis and register
    renaming, and more realistic alternatives.},
}

@inproceedings{bhandarkar&clark91,
  author = {Dileep Bhandarkar and Douglas W. Clark},
  title = {Performance from Architecture: Comparing a {RISC} and a
    {CISC} with Similar Hardware Organization},
  crossref = {asplos91},
  pages = {310--319},
}

@inproceedings{Appel&Li91,
  author = {Andrew W. Appel and Kai Li},
  title = {Virtual Memory Primitives for User Programs},
  crossref = {asplos91},
  pages = {96--107},
}

@proceedings{asplos91,
  key       = {ASPLOS-IV},
  title     = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-IV)},
  booktitle = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-IV)},
  year      = {1991},
}

@inproceedings{thornton64,
  author    = {J. E. Thornton},
  title     = {Parallel Operation in {Control Data~6600}},
  booktitle = {AFIPS Fall Joint Computer Conference},
  year      = {1964},
  pages     = {33--40},
}


@article{tomasulo67,
  author  = {R. M. Tomasulo},
  title   = {An Efficient Algorithm for Exploiting Multiple
             Arithmetic Units},
  journal = ibmjrd,
  year    = {1967},
  volume  = {11},
  number  = {1},
  pages   = {25--33},
}


@book{thornton70,
  author    = {J. E. Thornton},
  title     = {Design of a Computer},
  publisher = {Scott, Foresman},
  address   = {Glenview, Ill.},
  year      = {1970},
}

@book{aho+86,
  author    = {Alfred V. Aho and Ravi Sethi and Jeffrey D. Ullman},
  title     = {Compilers. Principles, Techniques, and Tools},
  publisher = {Addison-Wesley},
  year      = {1986},
}

@book{ellis85,
  author    = {John R. Ellis},
  title     = {Bulldog: A Compiler for {VLIW} Architectures},
  publisher = {MIT Press},
  year      = {1985},
  annote    = {The compiler uses traditional optimizations, trace
               scheduling, memory-reference disambiguation, and
               memory-bank disambiguation. The main problem of the
               code generator is to place operations on the right
               node (functional unit\&register bank) to reduce movements
               between nodes and functional unit contention.
               Measurements are made on numeric programs. On most
               programs a good speedup is achieved, but sometimes
               the speedup is low or non-existent.},
}

@incollection{gross&ward91,
  author   = {T. Gross and M. Ward},
  title    = {The Suppression of Compensation Code},
  crossref = {nicolau+91},
  year     = {1991},
  pages    = {260--273},
  annote   = {Presents an algorithm to suppress the redundant
              compensation code that na{\"i}ve trace scheduling
              produces when it moves code across an if-statement.
              This algorithm is more general than the solution
              proposed in \cite{ellis85}. Measurements show a
              reduction of 0\%--95\% in compensation code.},
}

@incollection{aiken&nicolau91,
  author   = {A. Aiken and A. Nicolau},
  title    = {A Realistic Resource-Constrained Software Pipelining
              Algorithm},
  crossref = {nicolau+91},
  year     = {1991},
  pages    = {274--290},
}

@incollection{larus91,
  author   = {J. R. Larus},
  title    = {Parallelism in Numeric and Symbolic Programs},
  crossref = {nicolau+91},
  year     = {1991},
  pages    = {331--349},
  annote   = {Symbolic (i.e. nonnumeric programs like {\tt gcc})
              profit only a little from loop parallelization techniques
              developed for numeric programs.},
}

@incollection{pingali+91,
  author   = {K. Pingali and M. Beck and R. Johnson and M.
              Moudgill and P. Stodghill},
  title    = {Dependence Flow Graphs: an Algebraic Approach to
              Program Transformation},
  crossref = {nicolau+91},
  year     = {1991},
  pages    = {445--467},
  annote   = {Introduces Dependence Flow Graphs, a new
              intermediate representation designed to facilitate
              optimization and demonstrates its advantages using
              constant propagation as an example. Dependence Flow
              graphs are similar to data flow graphs, but include
              memory manipulation operators and represent loops
              explicitly.},
}

@book{nicolau+91,
  editor    = {Alexandru Nicolau and David Gelernter and Thomas Gross and
               David Padua},
  title     = {Advances in Languages and Compilers for Parallel Processing},
  booktitle = {Advances in Languages and Compilers for Parallel Processing},
  series    = {Research Monographs in Parallel and Distributed Programming},
  publisher = {Pitman},
  address   = {London},
  year      = {1991},
}

@article{dongarra&jinds79,
  author  = {J. J. Dongarra and A. R. Hinds},
  title   = {Unrolling Loops in {Fortran}},
  journal = spe,
  year    = {1979},
  volume  = {9},
  number  = {3},
  pages   = {219--226},
  month   = mar,
}

@inproceedings{linn83,
  author    = {Joseph L. Linn},
  title     = {{SRDAG} compaction --- A Generalization of Trace
               Scheduling to Increase the Use of Global Context Information},
  booktitle = {MICRO-16, The $16^{\it th}$ Annual Microprogramming Workshop},
  year      = {1983},
  pages     = {11--22},
  annote    = {Generalizes Trace Scheduling to work on singly rooted
               DAGs. The root basic block is compacted, then a new
               SRDAG is selected and its root is compacted.
               Unfortunately the paper contains only theoretical
               results on the performance of the algorithm.},
}

@inproceedings{smotherman+91,
  author    = {Mark Smotherman and Sanjay Krishnamurthy and P. S.
               Aravind and David Hunnicutt},
  title     = {Efficient {DAG} Construction and Heuristic Calculation
               for Instruction Scheduling},
  booktitle = {MICRO-24, $24^{\it th}$ Annual Intl. Symp. on
               Microarchitecture},
  year      = {1991},
  pages     = {93--102},
  annote    = {Gives an overview of the heuristics used in papers
               on instruction scheduling and classifies, how they
               can be computed (in a forward or backward pass or
               during scheduling). Compares algorithms for building
               the dependence DAG ($n^2$ and table-building) and
               shows the interactions between DAG-building and
               heuristic-computation.},
}

@article{freiburghouse74,
  author  = {R. A. Freiburghouse},
  title   = {Register Allocation Via Usage Counts},
  journal = cacm,
  year    = {1974},
  volume  = {17},
  number  = {11},
  pages   = {638--642},
  month   = nov,
}

@article{adam+74,
  author  = {Thomas L. Adam and K. M. Chandy and J. R. Dickson},
  title   = {A Comparison of List Schedules for Parallel
             Processing Systems},
  journal = cacm,
  year    = {1974},
  volume  = {17},
  number  = {12},
  pages   = {685--690},
  month   = dec,
}

@inproceedings{smith+90,
  author   = {Michael D. Smith and Monica S. Lam and Mark A. Horowitz},
  title    = {Boosting Beyond Static Scheduling in a Superscalar Processor},
  crossref = {isca90},
  pages    = {344--354},
  annote   = {Proposes the use of static scheduling and hardware
              backup for speculative execution. Every instruction
              has a tag that indicates whether it is executed
              speculatively. Looks as if the shadow registers have
              to be addressed explicitly. The branch instruction
              then commits or squashes the results of speculative
              execution. They call this technique boosting. They
              compare this scheme with one-branch speculation to a
              dynamically scheduled machine, assuming no alias
              detection in both cases. Boosting is slightly
              better, although their scheduler was quite
              restricted.},
}

@proceedings{isca90,
  key        = {ISCA-17},
  title      = {The $17^{th}$ Annual International Symposium on
                Computer Architecture (ISCA)},
  booktitle  = {The $17^{th}$ Annual International Symposium on
                Computer Architecture (ISCA)},
  year       = {1990},
  OPTaddress = {Seattle},
  OPTmonth   = jun,
}

@inproceedings{smith+89,
  author   = {Michael D. Smith and Mike Johnson and Mark A. Horowitz},
  title    = {Limits on Multiple Instruction Issue},
  crossref = {asplos89},
  pages    = {290--302},
  annote   = {Evaluates the instruction-level parallelism available
              in various superscalar designs with hardware
              scheduling and speculative execution. They emphasize
              that instruction fetch is the worst bottleneck. This
              result is questionable, as they employ only branch
              prediction, but no branch target buffering (although
              they call their prediction branch target buffer).},
}

@inproceedings{jouppi&wall89,
  author   = {Norman P. Jouppi and David W. Wall},
  title    = {Available Instruction-Level Parallelism for
              Superscalar and Superpipelined Machines},
  crossref = {asplos89},
  pages    = {272--282},
  annote   = {Presents a nice framework for understanding instruction-level
              parallelism. Current Risc Processors already exploit
              a certain amount of parallelism, e.g. when loading and
              branching. They measure instruction-level
              parallelism, but obviously only for basic block
              scheduling. Therefore they get a discouraging result.
              They also measure the effect of optimizations on
              parallelism. They seem to be quite neutral.},
}

@proceedings{asplos89,
  key       = {ASPLOS-III},
  title     = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-III)},
  booktitle = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-III)},
  year      = {1989},
}

@article{popescu+91,
  author  = {Val Popescu and Merle Schulz and John Spracklen and
             Gary Gibson and Bruce Lightner and David Isaman},
  title   = {The {Metaflow} Architecture},
  journal = ieeemicro,
  year    = {1991},
  volume  = {11},
  number  = {3},
  pages   = {10--13, 63--73},
  month   = jun,
  annote  = {Introduces hardware scheduling etc. Then a
             microarchitecture to achieve it is described. It is
             quite similar to \cite{sohi&vajapeyam87}, but has different
             register renaming, that lends itself better to
             speculative execution. At last they describe the
             realisation in the Lightning processor, a 4-scalar
             SPARC.},
}

@inproceedings{sohi&vajapeyam87,
  author   = {Gurindar S. Sohi and Sriram Vajapeyam},
  title    = {Instruction Issue Logic for High-Performance,
              Interruptable Pipelined Processors},
  crossref = {isca87},
  pages    = {27--34},
  note     = {Newer version: \cite{sohi90}},
  annote   = {Discuss a microarchitecture for superscalar
              execution. They explain it by transforming Tomasulo's
              solution into theirs. Their solution to precise
              interrupts is to retire the instructions in order,
              i.e. write the results back in order. This also
              provides for speculative execution. Simulations show
              that a CRAY-1 with these mechanisms, but no other
              additional hardware is 51\% faster (window size 20).
              Leaving out the most expensive part drops the speedup
              to 31\%.},
}

@article{sohi90,
  author  = {Gurindar S. Sohi},
  title   = {Instruction Issue Logic for High-Performance,
             Interruptable, Multiple Functional Unit, Pipelined
             Processors},
  journal = ieeetc,
  year    = {1990},
  volume  = {39},
  number  = {3},
  pages   = {349--359},
  month   = mar,
}

@inproceedings{su&ding85,
  author   = {Bogong Su and Shiyuan Ding},
  title    = {Some Experiments in Global Microcode Compaction},
  crossref = {micro85},
  pages    = {175--180},
  annote   = {Describes a few global compaction techniques and
              compares them by compacting a few example programs.},
}

@inproceedings{patt+85a,
  author   = {Yale N. Patt and {Wen-mei} Hwu and Michael Shebanow},
  title    = {{HPS}, a New Microarchitecture: Rationale and Introduction},
  crossref = {micro85},
  pages    = {103--108},
  annote   = {CISC instructions are decoded into RISC instructions,
              which are executed in parallel using dynamic
              scheduling etc. This microengine is presented as a
              restricted data flow machine.},
}

@inproceedings{patt+85b,
  author   = {Yale N. Patt and Stephen W. Melvin and {Wen-mei} Hwu
              and Michael C. Shebanow},
  title    = {Critical Issues Regarding {HPS}, a High Performance Microarchitecture},
  crossref = {micro85},
  pages    = {109--116},
  annote   = {Discusses in depth some of the issues in dynamic
              scheduling hardware.},
}

@proceedings{micro85,
  key       = {MICRO-18},
  title     = {The $18^{th}$ Annual Workshop on Microprogramming
               (MICRO-18)},
  booktitle = {The $18^{th}$ Annual Workshop on Microprogramming
               (MICRO-18)},
  year      = {1985},
}

@inproceedings{hwu&patt87isca,
  author   = {{Wen-mei} Hwu and Yale N. Patt},
  title    = {Checkpoint Repair for Out-of-order Execution Machines},
  crossref = {isca87},
  pages    = {18--26},
  note     = {Newer version: \cite{hwu&patt87ieeetc}},
  annote   = {Describes design issues in checkpoint mechanisms for
              precise interrupts and speculative execution. Their
              design uses backup register files and difference
              techniques for main memory. Instructions can be
              retired out-of-order, avoiding full window
              conditions.},
}

@article{hwu&patt87ieeetc,
  author  = {{Wen-mei} Hwu and Yale N. Patt},
  title   = {Checkpoint Repair for High-Performance Out-of-order
             Execution Machines},
  journal = ieeetc,
  year    = {1987},
  volume  = {36},
  number  = {12},
  pages   = {1496--1514},
  month   = dec,
}

@article{anathana&long90,
  author  = {Kasi Anantha and Fred Long},
  title   = {Code Compaction For Parallel Architectures},
  journal = spe,
  year    = {1990},
  volume  = {20},
  number  = {6},
  pages   = {537--554},
  month   = jun,
  annote  = {Describes methods for exploiting instruction-level
             parallelism based on code movement between basic
             blocks and renaming. They describe their data
             structures and list their algorithms. Unfortunately
             the system seems to work only on toy problems.},
}

@inproceedings{cooper85,
  author    = {K. Cooper},
  title     = {Analyzing aliases of reference formal parameters},
  booktitle = {Conference Record of the Twelfth ACM Symposium on
               Principles of Programming Languages},
  year      = {1985},
}

@inproceedings{coutant86,
  author    = {D. S. Coutant},
  title     = {Retargetable High-Level alias Analysis},
  booktitle = {Conference Record of the Thirteenth ACM Symposium on
               Principles of Programming Languages},
  year      = {1986},
}

@book{stallings90,
  editor    = {William Stallings},
  title     = {Reduced Instruction Set Computers},
  edition   = {second},
  publisher = {IEEE Computer Society Press},
  year      = {1990},
  annote    = {A collection of articles on RISCs, most of them
               pretty old (before '87) and/or low-level},
}

@article{chaitin+81,
  author  = {Gregory J. Chaitin and Marc A. Auslander and Ashok K.
             Chandra and John Cocke and Martin E. Hopkins and
             Peter W. Markstein},
  title   = {Register Allocation via Coloring},
  journal = complang,
  year    = {1981},
  volume  = {6},
  number  = {1},
  pages   = {45--57},
  note    = {Reprinted in \cite{stallings90}},
  annote  = {The seminal paper on coloring register allocation.
             The spill code generation differs much from Chaitin's
             later paper and reminds me of hierarchical graph
             coloring.},
}

@inproceedings{shieh&papachristou89,
  author   = {Jong-Jiann Shieh and Christos A. Papachristou},
  title    = {On Reordering Instruction Streams for Pipelined
              Computers},
  crossref = {micro22},
  pages    = {199--206},
  annote   = {Another paper on basic block instruction scheduling.},
}

@inproceedings{schwiegelshohn+89,
  author   = {U. Schwiegelshohn and F. Gasperoni and K. Ebcio\u{g}lu},
  title    = {On Optimal Loop Parallelization},
  crossref = {micro22},
  pages    = {141--147},
  annote   = {Proves that there are loops that cannot be
              parallelized optimally even with unlimited resources,
              because they could demand more resources in every
              iteration, exceeding any bound. Optimal
              parallelization means, that the program executes in
              the time given by its critical path length.},
}

@inproceedings{nakatani&ebcioglu89,
  author   = {Toshio Nakatani and Kemal Ebcio\u{g}lu},
  title    = {{``Combining''} as a Compilation Technique for {VLIW} Architectures},
  crossref = {micro22},
  pages    = {43--55},
  annote   = {They reduce the path length by combining operations
              on immediate values. Combining is used in conjunction
              with percolation scheduling, software pipelining and
              loop unrolling on an interesting VLIW architecture.
              In this architecture every instruction consists of a
              decision tree with the operations on the edges and
              condition code tests on the nodes. In this setting
              combining results in a speedup of up to 18\%.},
}

@proceedings{micro22,
  key       = {MICRO-22},
  title     = {$22^{\it nd}$ Annual International Workshop on
               Microprogramming and Microarchitecture (MICRO-22)},
  booktitle = {$22^{\it nd}$ Annual International Workshop on
               Microprogramming and Microarchitecture (MICRO-22)},
  year      = {1989},
}

@book{kuck78,
  author    = {David J. Kuck},
  title     = {The Structure of Computers and Computations},
  volume    = {1},
  publisher = {John Wiley \& Sons},
  year      = {1978},
  annote    = {A textbook on computer hardware and architecture.
               Contains some interesting things that are now
               reappearing (e.g. a chapter on tree-height
               reduction).},
}

@article{cocke88,
  author  = {John Cocke},
  title   = {The Search for Performance in Scientific Processors},
  journal = cacm,
  year    = {1988},
  volume  = {31},
  number  = {3},
  pages   = {250--253},
  month   = mar,
  note    = {Turing Award Lecture},
  annote  = {Contains among other things a description of the
             Advanced Computer System (ACS)
             project at IBM 1964--1968 (superscalar (1 branch, 3
             integer, 2 fpadd, 1 fpmul, 2 memory), branch
             prediction, compiler support); History of the 801},
}

@mastersthesis{stuerzlinger89,
  author = {Wolfgang St{\"u}rzlinger},
  title  = {{C-Compiler f\"ur den VIP-Prozessor}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1989},
}

@inproceedings{beaty+90,
  author   = {Steven Beaty and Gearold Johnson and Darrell Whitley},
  title    = {Motivation and Framework for Using Genetic Algorithms
              for Microcode Compaction},
  crossref = {micro23},
  pages    = {117--124},
  note     = {Reprinted in: SIGmicro Newsletter, January 1991},
  annote   = {Gives a good introduction in genetic algorithms and
              applies them to (local) microcode compaction: The
              chromosomes are the priority list of the instructions.},
}

@inproceedings{nakatani&ebcioglu90,
  author   = {Toshio Nakatani and Kemal Ebcio\u{g}lu},
  title    = {Using a Lookahead Window in a Compaction-Based
              Parallelizing Compiler},
  crossref = {micro23},
  pages    = {57--68},
  note     = {Reprinted in: SIGmicro Newsletter, January 1991},
}

@mastersthesis{ambrosch93,
  author = {Wolfgang Ambrosch},
  title  = {{Analyse und Vergleich von Registerallokationsalgorithmen}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1993},
}

@mastersthesis{beer93,
  author = {Felix Beer},
  title  = {{Globale Optimierung}},
  school = {{Technische Universit\"{a}t Wien}},
  year   = {1993},
}

@techreport{larus&ball92,
  author      = {James R. Larus and Thomas Ball},
  title       = {Rewriting Executable Files to Measure Program Behavior},
  institution = {University of Wisconsin Computer Sciences},
  number      = {1083},
  year        = {1992},
  ftp         = {primost.cs.wisc.edu},
  ftpfile     = {pub/rewriting-tr.ps.Z},
  annote      = {Describes the advantages of inserting instrumentation
                 code after linking (not too convincing) and discusses
                 properties of the executable file format, that cause
                 or solve problems with this approach. Their method
                 does not insert instrumentation code at every edge of
                 the control flow graph and computes the missing
                 information from the rest. The most heavily used
                 edges can run at full speed.},
}

@article{diefendorff&allen92,
  author  = {Keith Diefendorff and Michael Allen},
  title   = {Organization of the {Motorola} 88110 Superscalar {RISC} Microprocessor},
  journal = ieeemicro,
  year    = {1992},
  volume  = {12},
  number  = {2},
  pages   = {40--63},
  month   = apr,
  annote  = {The 88110 can issue two instructions per cycle using
             ten functional units (two integer); It has special
             graphics commands, an extra $32 \times 80$ register
             file and fast (3-cycle) floating point. The
             instructions are executed in-order, except for
             branches and stores. Speculative execution is
             supported through a history buffer.},
}

@inproceedings{schuetz92,
  author       = {Udo Sch{\"u}tz},
  title        = {{Optimierung von Fadencode}},
  booktitle    = {{FORTH-Tagung}},
  organization = {Forth Gesellschaft e.V.},
  address      = {Rostock},
  year         = {1992},
  annote       = {Describes peephole optimization of Forth's threaded
                  code. While I doubt that the impact on the run time
                  justifies the effort, it may convince hackers to
                  abstain from optimizing the code into a mess.},
}

@article{nicolau89,
  author  = {Alexandru Nicolau},
  title   = {Run-Time Disambiguation: Coping with Statically
             Unpredictable Dependencies},
  journal = ieeetc,
  year    = {1989},
  volume  = {38},
  number  = {5},
  pages   = {663--678},
  month   = may,
  annote  = {Static alias analysis often cannot determine whether
             two memory accesses refer to the same location.
             Conventionally the worst case is assumed, and the
             resulting dependency prohibits good schedules.
             Run-time Disambiguation assumes
             the best case, and checks this assumption at
             run-time. The application of this idea in the
             {Bulldog} trace scheduling compiler is discussed. The
             speedup achieved over {Bulldog} without RTD is up
             to~7. The code expansion is about proportional to the
             speedup, but can be reduced without too much effect
             on speed by not applying RTD to rarely-executed parts
             and by combining the routines handling the
             exceptional cases.},
}

@inproceedings{blanck&krueger92,
  author          = {Greg Blanck and Steve Krueger},
  title           = {The {SuperSPARC} Microprocessor},
  booktitle       = {COMPCON: Digest of Papers},
  year            = {1992},
  pages           = {136--141},
  OPTorganization = {IEEE},
  annote          = {The SuperSPARC (aka Viking) is a superscalar SPARC
                     implementation that can issue up to three
                     instructions per cycle. Which instruction can issue
                     together is determined by a set of 23 rules. Some
                     specialties: results from a load are available in the
                     next cycle; dependent integer instructions can be
                     issued in the same cycle, even a load can depend on
                     one integer instruction in the same cycle; ``hard''
                     instructions like SAVE issue alone.},
}

@inproceedings{nicolau85,
  author    = {Alexandru Nicolau},
  title     = {Uniform Parallelism Exploitation in Ordinary Programs},
  booktitle = {1985 International Conference on
               Parallel Processing},
  year      = {1985},
  pages     = {614--618},
  annote    = {Describes the basic transformation rules and the
               conceptual framework of Percolation Scheduling.
               However, no guiding rules etc. are detailed.},
}

@article{smith&pleszkun88,
  author  = {James E. Smith and Andrew R. Pleszkun},
  title   = {Implementing Precise Interrupts in Pipelined Processors},
  journal = ieeetc,
  year    = {1988},
  volume  = {37},
  number  = {5},
  pages   = {562--573},
  month   = may,
  annote  = {After defining precise interrupts this paper
             describes several ways to achieve them in pipelined
             machines: Plain in-order completion is slow, because
             new instructions must wait longer for the results. To
             resolve the problem, the paper presents in-order
             completion with bypasses, history buffers and future
             files (a shadow register file that keeps the
             imprecise state). Stores should be issued immediately
             and buffered in the memory unit to avoid performance
             problems. A performance analysis is done (on a
             high-latency model) and extensions to virtual memory,
             cache memory, and vectors are discussed.},
}

@book{kane&heinrich92,
  author    = {G. Kane and J. Heinrich},
  title     = {{MIPS RISC} Architecture},
  publisher = {Prentice-Hall},
  year      = {1992},
}

@article{padua&wolfe86,
  author  = {David A. Padua and Michael J. Wolfe},
  title   = {Advanced Compiler Optimizations for Supercomputers},
  journal = cacm,
  year    = {1986},
  volume  = {29},
  number  = {12},
  pages   = {1184--1201},
  month   = dec,
}

@inproceedings{chaitin82,
  author   = {G. J. Chaitin},
  title    = {Register Allocation \& Spilling via Graph Coloring},
  crossref = {sigplan82},
  pages    = {98--105},
}

@book{zech84,
  author    = {Ronald Zech},
  title     = {{Die Programmiersprache FORTH}},
  edition   = {First},
  publisher = {Franzis},
  address   = {M{\"u}nchen},
  year      = {1984},
  note      = {In German},
}

@inproceedings{briggs+92,
  author   = {Preston Briggs and Keith D. Cooper and Linda Torczon},
  title    = {Rematerialization},
  crossref = {sigplan92},
  pages    = {311--321},
  annote   = {Some values are cheaper to recompute than to spill.
              This paper describes a framework for exploiting
              these benefits. It consists of using static single
              assignment form for analysis, splitting live ranges and
              modifications to register coalescing. The results are
              positive, but it is not clear how significant they
              are.},
}

@inproceedings{proebsting&fischer92,
  author   = {Todd A. Proebsting and Charles N. Fischer},
  title    = {Probabilistic Register Allocation},
  crossref = {sigplan92},
  pages    = {300--310},
  annote   = {The probability that a value survives in a register
              during an execution path is used for deciding which
              value to hold in a register and which to spill.
              Empirical data is presented, but unfortunately no
              comparison with other approaches. The algorithm seems
              to be very slow.},
}

@inproceedings{rau+92,
  author   = {B. R. Rau and M. Lee and P. P. Tirumalai and M. S. Schlansker},
  title    = {Register Allocation for Software Pipelined Loops},
  crossref = {sigplan92},
  pages    = {283--299},
  annote   = {The problem of register allocation in modulo
              scheduled loops is to allocate the registers in a way
              that minimizes register idle time. The constraints
              on allocations depend on the code generation strategy
              (hardware support, preconditioning vs.\ multiple loop
              exits etc.). Registers are heuristically
              allocated to lifetimes in a heuristically determined
              order. The best heuristics work very well and produce
              allocations at or near the lower bound. Therefore the
              authors recommend choosing a code generation strategy
              that minimizes the produced code.},
}

@inproceedings{mueller&whalley92,
  author   = {Frank Mueller and David B. Whalley},
  title    = {Avoiding Unconditional Jumps by Code Replication},
  crossref = {sigplan92},
  pages    = {322--330},
  annote   = {Nearly all unconditional jumps can be eliminated by
              code replication. This expands compiled C code by
              about 50\%, but reduces the number of executed
              instructions and the cache work (miss ratio increases
              slightly). There are some nontrivial problems in this
              technique, which are solved heuristically in this
              paper.},
}

@inproceedings{granlund&kenner92,
  author   = {Torbj{\"o}rn Granlund and Richard Kenner},
  title    = {Eliminating Branches using a Superoptimizer and the
              {GNU C} compiler},
  crossref = {sigplan92},
  pages    = {341--352},
  annote   = {A superoptimizer is used to generate optimal code
              fragments for use by the GNU C compiler. The
              fragments are for conditional expressions for the
              RS/6000. Their superoptimizer uses an interpreter
              instead of the target machine and is therefore
              machine-independent.},
}

@inproceedings{brooks+92,
  author   = {Gary Brooks and Gilbert J. Hansen and Steve Simmons},
  title    = {A New Approach to Debugging Optimized Code},
  crossref = {sigplan92},
  pages    = {1--11},
  annote   = {Stepping through the code is performed by
              highlighting the active source code, in optimized
              execution order. Optimization is not made
              transparent. The paper describes a compiler-debugger
              interface for this purpose, but the description seems
              to be a little too specific (there's no
              explanation what is necessary for the approach
              and what's specific to the system).},
}

@inproceedings{ramsey&hanson92,
  author   = {Norman Ramsey and David R. Hanson},
  title    = {A Retargetable Debugger},
  crossref = {sigplan92},
  pages    = {22--31},
  annote   = {Postscript is used for communication between compiler
              and debugger.},
}

@inproceedings{tjiang&hennessy92,
  author   = {Steven W. K. Tjiang and John L. Hennessy},
  title    = {Sharlit---A tool for building optimizers},
  crossref = {sigplan92},
  pages    = {82--93},
  annote   = {Sharlit is a tool for writing data flow analysers. It
              can work on simple one-instruction-one-node flow graphs.
              Flow graph simplification rules are used to reduce
              the run-time and space cost of the analyser.},
}

@inproceedings{tan&lin92,
  author   = {Jichang Tan and I-Peng Lin},
  title    = {Compiling Data Flow Analysis of Logic Programs},
  crossref = {sigplan92},
  pages    = {106--115},
  annote   = {Data flow analysis is performed by compiling the
              Prolog program to WAM code and interpreting the WAM
              code in a different way, i.e.\ abstract
              interpretation at the WAM code level. If compilation
              time to WAM code is not counted, they get speedups of
              14--575 times over the Aquarius Prolog compiler on
              small programs, but the compilation to the WAM takes
              as much time as the analysis on the Aquarius Prolog
              compiler.},
}

@inproceedings{jaffar+92,
  author   = {Joxan Jaffar and Spiro Michaylov and Peter J. Stuckey
              and Roland H. C. Yap},
  title    = {An Abstract Machine for {CLP($\cal R$)}},
  crossref = {sigplan92},
  pages    = {128--139},
}

@inproceedings{hendren+92sigplan,
  author   = {Laurie J. Hendren and Joseph Hummel and Alexandru Nicolau},
  title    = {Abstractions for Recursive Pointer Data Structures:
              Improving the Analysis and Transformation of
              Imperative Programs},
  crossref = {sigplan92},
  pages    = {249--260},
  annote   = {Analysis of data structure using pointers can be
              improved, if the data structure definitions contain
              information about how the pointers in them can be
              used. This paper presents such a notation (ADDS) and
              describes its formal properties and applications.},
}

@inproceedings{diwan+92,
  author   = {Amer Diwan and Eliot Moss and Richard Hudson},
  title    = {Compiler Support for Garbage Collection in a
              Statically Typed Language},
  crossref = {sigplan92},
  pages    = {273--282},
  annote   = {The compiler generates a descriptor for every garbage
              collection point that enables the run-time system to
              modify the appropriate values for copying garbage
              collection. This includes derived values like
              differences between pointers that may arise due to
              optimizations.},
}

@inproceedings{hoelzle+92,
  author   = {Urs H{\"o}lzle and Craig Chambers and David Ungar},
  title    = {Debugging Optimized Code with Dynamic Deoptimization},
  crossref = {sigplan92},
  pages    = {32--43},
  annote   = {The debugger for SELF hides the effects of
              optimization. This is achieved by not performing some
              optimizations (e.g. tail-call optimization), and
              restricting others to the areas between interrupt
              points (one per procedure call or loop iteration). On
              debugging the unoptimized (stack) state and methods
              are recovered lazily (only the active ones).
              Adding debugging information increases space usage by
              2.2--3.3 times.},
}

@inproceedings{proebsting92,
  author   = {Todd A. Proebsting},
  title    = {Simple and Efficient {BURS} Table Generation},
  crossref = {sigplan92},
  pages    = {331--340},
  annote   = {Describes an algorithm for generating tree parsing
              automata from tree grammars. The description is very
              detailed. Table compression techniques are used to
              avoid huge tables. The paper describes a compression
              technique that is both simpler and more effective
              than earlier techniques. The paper also discusses
              engineering issues in the implementation of the
              algorithm and gives an empirical comparison with
              previous work.},
}

@proceedings{sigplan92,
  key       = {SIGPLAN '92},
  title     = {SIGPLAN '92 Conference on Programming Language Design
               and Implementation},
  booktitle = {SIGPLAN '92 Conference on Programming Language Design
               and Implementation},
  year      = {1992},
}

@book{ting81,
  author     = {C. H. Ting},
  title      = {Systems Guide to fig-Forth},
  publisher  = {Offete Enterprises, Inc.},
  address    = {San Mateo, CA 94402},
  year       = {1981},
  OPTedition = {First},
}

@inproceedings{lam&wilson92,
  author   = {Monica S. Lam and Robert P. Wilson},
  title    = {Limits of Control Flow on Parallelism},
  crossref = {isca92},
  pages    = {46--57},
  annote   = {Or rather: How control flow limits instruction-level
              parallelism. Several means for circumventing these
              limits are discussed: unidirectional speculative
              execution, control dependence analysis (treatment of
              code interrupted by e.g.\ an if-statement), and
              executing multiple flows of control. Machine models
              based on combinations of these techniques were
              simulated and empirical results on non-numeric
              benchmarks are listed. The simulations relax nearly
              all scheduling constraints except true data
              dependences and model-related control-flow
              constraints. The model-related dependences are not
              entirely realistic, e.g. no reordering of branches
              and only one branch/cycle without multiple flow of
              control. The parallelism of the non-numeric
              benchmarks on the SP-CD-MF model is limited to
              18--402 (HM 39.6).},
}

@InProceedings{fernandes&barbosa92,
  author = 	"Edil S. T. Fernandes and Fernando M. B. Barbosa",
  title = 	"Effects of Building Blocks on the Performance of
		 Super-Scalar Architectures",
  crossref =	"isca92",
  pages = 	"36--45",
  annote = 	"De mortuis ..."
}

@InProceedings{franklin&sohi92,
  author = 	"Manoj Franklin and Gurindar S. Sohi",
  title = 	"The Expandable Split Window Paradigm For Exploiting
		 Fine-Grain Parallelism",
  crossref =	"isca92",
  pages = 	"58--67",
  annote = 	"This new architecture is situated between superscalar and
		 shared-memory machine. The guiding principle of this
		 machine is decentralization, in order to be
		 expandable (scalable). It consists of several
		 {\em stages}, which are nearly full processors. Each
		 stage processes a small chunk of code ({\em basic
		 window}) at a time, e.g. a basic block or an
		 if-statement. The stages are organized in a queue,
		 processing (probably) consecutive windows. The basic
		 windows are executed in a pipelined fashion: The
		 results needed by later basic windows are passed
		 along, so they can execute concurrently with earlier
		 basic windows. An architecture along this framework
		 has been simulated running the SPEC benchmarks and
		 the results on ordinary code are comparable to other
		 superscalar processors. With a bit of scheduling even
		 larger parallelism can be achieved."
}

@InProceedings{degloria&farabischi92,
  author = 	"De Gloria, Alessandro and Paolo Faraboschi",
  title = 	"Instruction-level Parallelism in {Prolog}: Analysis and
		 Architectural Support",
  crossref =	"isca92",
  pages = 	"224--233",
  annote = 	"Traces of code produced by the Aquarius Prolog compiler
		 display the following properties: 32\% of the
		 executed instructions access memory. Branch
		 prediction accuracy is 86\%. The average basic block
		 length is 6~instructions. Using trace scheduling
		 increases the size of the scheduled code chunks to
		 11--12 and increases parallelism by 30\% over basic
		 block scheduling. The overall speedup on a multi-ALU
		 machine over a sequential machine is up to 1.95, with
		 a two-ALU machine achieving a speedup of 1.89. A
		 prototype VLIW processor based on this work delivers
		 2.1~MLIPS (30~MHz)."
}

@InProceedings{lenoski+92,
  author = 	"Daniel Lenoski and James Laudon and Truman Joe and
		 David Nakahira and Luis Stevens and Anoop Gupta and
		 John Hennessy",
  title = 	"The {DASH} Prototype: Implementation and Performance",
  crossref =	"isca92",
  pages = 	"92--103",
  annote = 	"DASH is a large-scale shared-memory multiprocessor.
		 The final goal is a 64-processor machine. The current
		 16-processor version shows speedup factors of 4--14
		 on parallel applications. DASH uses 4-processor
		 clusters; accesses to other clusters are served by a
		 directory mechanism."
}

@InProceedings{intrater&spillinger92,
  author = 	"Gideon Intrater and Ilan Spillinger",
  title = 	"Performance Evaluation of a Decoded Instruction Cache
		 for Variable Instruction-Length Computers",
  crossref =	"isca92",
  pages = 	"106--113",
  annote = 	"Decoded instruction caches for variable
		 instruction-length machines differ somewhat in their
		 behaviour from usual caches, because the addresses of
		 consecutive instructions cannot be simply mapped into
		 consecutive cache lines. The paper discusses and
		 evaluates various mapping schemes based on
		 suppressing low-order bits, associativity and line
		 size. Associativity is more important for decoded
		 instruction caches. On the other hand, increasing
		 line size is not as useful, because only instructions
		 at the start of a line can be found in decoded
		 instruction caches."
}

@InProceedings{hirita+92,
  author = 	"Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine
		 and Yoshiyuki Mochizuki and Akio Nishimura and
		 Yoshimori Nakase and Teiji Nishizawa",
  title = 	"An Elementary Processor Architecture with
		 Simultaneous Instruction Issuing from Multiple
		 Threads",
  crossref =	"isca92",
  pages = 	"136--145",
  annote = 	"A mixture of superscalar and shared-memory machine:
		 The machine has several functional units, several
		 register files and sequencing units. The functional
		 units are allocated to threads on a per-cycle basis.
		 Conflicts are being resolved using a priority scheme,
		 with the priorities of the threads changing very
		 often. Inter-thread communication is achieved through
		 queue registers. Iterations of a loop can be executed
		 in parallel on different threads. This architecture
		 achieves a speedup of up to 5.79 using 8 threads and
		 two load/store units on a ray-tracing program.
		 Trading threads for superscalar execution reduces
		 the performance of this application."
}

@InProceedings{yeh&patt92,
  author = 	"Tse-Yu Yeh and Yale N. Patt",
  title = 	"Alternative Implementations of Two-Level Adaptive
		 Branch Prediction",
  crossref =	"isca92",
  pages = 	"124--134",
  annote = 	"Two-level Adaptive Branch Prediction works by keeping
		 a history of the last $k$ taken/not-taken decisions
		 and using conventional (e.g.  two-bit) techniques
		 for predicting the behaviour following the history
		 pattern. Both the history and the pattern can be
		 maintained on a global or per-branch basis. All
		 variations result in similar performance when given
		 enough hardware, but per-branch history (12 bits) and
		 global patterns are least expensive. Two-Level
		 Adaptive Branch Prediction achieves 97\% prediction
		 accuracy for the SPEC '89 benchmarks."
}

@InProceedings{najjar+92,
  author = 	"Walid A. Najjar and W. Marcus Miller and A. P. Wim B{\"o}hm",
  title = 	"An Analysis of Loop Latency in Dataflow Execution",
  crossref =	"isca92",
  pages = 	"352--360"
}

@InProceedings{kurian+92,
  author = 	"Lizyamma Kurian and Paul T. Hulina and Lee D. Coraor",
  title = 	"Memory Latency Effects in Decoupled Architectures
		 with a Single Data Memory Module",
  crossref =	"isca92",
  pages = 	"236--245",
  annote = 	"Machines with an independent memory access processor
		 are faster than uniprocessors with cache on numeric
		 code, if the memory latency is low. If the latency is
		 high, the result depends on the locality of the
		 accesses."
}

@InProceedings{austin&sohi92,
  author = 	"Todd M. Austin and Gurindar S. Sohi",
  title = 	"Dynamic Dependency Analysis of Ordinary Programs",
  crossref =	"isca92",
  pages = 	"342--351",
  annote = 	"A \cite{wall91asplos}-type study. This one emphasizes
		 methods for building the dependence graph and what
		 can be measured with them. It also presents
		 parallelism profiles, i.e. parallelism/time diagrams.
		 Other interesting points are varying the renaming
		 capabilities and the instruction window size."
}

@InProceedings{olukotun+92,
  author = 	"Kunle Olukotun and Trevor Mudge and Richard Brown",
  title = 	"Performance Optimization of Pipelined Primary Caches",
  crossref =	"isca92",
  pages = 	"181--190",
  annote = 	"An example of how to optimize cache parameters (size,
		 pipeline depth etc.) for high performance."
}

@Proceedings{isca92,
  key =		"ISCA-19",
  title = 	"The $19^{th}$ Annual International Symposium on
		 Computer Architecture (ISCA)",
  booktitle = 	"The $19^{th}$ Annual International Symposium on
		 Computer Architecture (ISCA)",
  year = 	"1992"
}

@Article{tevet89,
  author = 	"Adin Tevet",
  title = 	"Symbolic Stack Addressing",
  journal = 	jfar,
  year = 	"1989",
  volume = 	"5",
  number = 	"3",
  pages = 	"365--379",
  annote = 	"A local variable mechanism that uses the data stack
		 for storage. The variables are accessed by {\tt PICK}
		 and {\tt POST} (its opposite), which means that the
		 compiler must keep track of the stack depth. Includes
		 source code for 8086 F83."
}

@InCollection{glass83,
  author = 	"Harvey Glass",
  title = 	"Towards a More Writable {Forth} Syntax",
  crossref =	"ouverson86",
  chapter = 	"21",
  pages = 	"169--181",
  note = 	"Reprinted from {\em Proceedings of the 1983 Rochester
		 Forth Applications Conference}"
}

@InCollection{perry86,
  author = 	 {Michael A. Perry},
  title = 	 {A 68000 {Forth} Assembler},
  crossref =	 {ouverson86},
  chapter =	 {23},
  pages =	 {193--201}
}

@Book{ouverson86,
  title = 	 {Dr.\ Dobb's Toolbook of {Forth}},
  booktitle = 	 {Dr.\ Dobb's Toolbook of Forth},
  publisher = 	 {M\&T Books},
  year = 	 {1986},
  editor =	 {Marlin Ouverson},
  address =	 {Redwood City, CA 94063}
}

@Article{almy86,
  author = 	"Thomas Almy",
  title = 	"Compiling {Forth} for Performance",
  journal = 	jfar,
  year = 	"1986",
  volume = 	"4",
  number = 	"3",
  pages = 	"379--388",
  annote = 	"A batch Forth compiler for the 8086 and the Z80. It
		 produces machine code in executable files. It uses
		 peephole optimization and keeps up to two values from
		 the top of the stack in registers."
}

@Article{belinfante87,
  author = 	 "Johan G. F. Belinfante",
  title = 	 "{S/K/ID}: Combinators in {Forth}",
  journal =	 jfar,
  year =	 "1987",
  volume =	 "4",
  number =	 "4",
  pages =	 "555--580"
}

@Article{rose86,
  author = 	"Anthony Rose",
  title = 	"Design of a Fast 68000-Based Subroutine-Threaded
		 {Forth} with Inline Code \& an Optimizer",
  journal = 	jfar,
  year = 	"1986",
  volume = 	"4",
  number = 	"2",
  pages = 	"285--288",
  note =	"1986 Rochester Forth Conference",
  annote = 	"Inlines everything below a critical size and
		 peephole optimizes the code."
}



@InProceedings{hayes+87,
  author = 	"John R. Hayes and Martin E. Fraeman and Robert L.
		 Williams and Thomas Zaremba",
  title = 	"An Architecture for the Direct Execution of the
		 {Forth} Programming Language",
  crossref =	"asplos87",
  pages = 	"42--48",
  journal =	sigplan,
  OPTmonth = 	oct,
  OPTvolume =	"22",
  OPTnumber =	"10",
  annote = 	"The 32-bit FRISC processor features direct access to
		 the 4 top elements of both stacks, 4 user registers,
		 and the usual Forth processor features. It has two
		 16-element stack caches, that are maintained via
		 exceptions."
}

@Proceedings{asplos87,
  key =		"ASPLOS-II",
  title = 	"Architectural Support for Programming Languages and
		 Operating Systems (ASPLOS-II)",
  booktitle = 	"Architectural Support for Programming Languages and
		 Operating Systems (ASPLOS-II)",
  year = 	"1987",
}

@InProceedings{hasegawa&shigei85,
  author = 	 "Makoto Hasegawa and Yoshiharu Shigei",
  title = 	 "High-Speed Top-of-Stack Scheme for {VLSI} Processor: A
		  Management Algorithm and Its Analysis",
  pages =	 "48--54",
  booktitle =	 "International Symposium on Computer Architecture (ISCA)",
  year =	 "1985"
}

@Article{wall92,
  author = 	"David W. Wall",
  title = 	"Experience with a Software-Defined Machine Architecture",
  journal = 	toplas,
  year = 	"1992",
  volume = 	"14",
  number = 	"3",
  pages = 	"299--338",
  month = 	jul,
  annote = 	"Discusses an assembler/linker system that does not
		 present the actual instruction set to the
		 compiler/user. The assembler and linker translate
		 this IL into machine code and optimize it. The bulk
		 of the paper discusses the optimizations performed in
		 the linker: interprocedural register allocation and
		 instruction scheduling. The instruction scheduler
		 also performs a little speculative execution."
}

@Article{jaffar+92,
  author = 	"Joxan Jaffar and Spiro Michaylov and Peter J. Stuckey
		 and Roland H. C. Yap",
  title = 	"The {CLP($\cal R$)} Language and System",
  journal = 	toplas,
  year = 	"1992",
  volume = 	"14",
  number = 	"3",
  pages = 	"339--395",
  month = 	jul
}

@Book{appel92,
  author = 	"Andrew W. Appel",
  title = 	"Compiling with Continuations",
  publisher = 	"Cambridge University Press",
  year = 	"1992"
}

@InProceedings{freudenberger&ruttenberg91,
  author = 	"Stefan M. Freudenberger and John C. Ruttenberg",
  title = 	"Phase Ordering of Register Allocation and Instruction
		 Scheduling",
  crossref =	"codegen91",
  pages = 	"146--170",
  annote = 	"They use a technique that reminds me of
		 coagulation \cite{morris91} to solve the instruction
		 scheduling/register allocation phase ordering
		 problem. A trace scheduler selects traces in
		 decreasing order of (expected) execution frequency
		 and then passes the traces to the instruction
		 scheduler which also performs trace-local register
		 allocation. The allocation decisions made by the
		 scheduler on earlier (i.e. more frequent) traces have
		 to be respected later. In contrast to coagulation
		 there is no register renaming, when scheduled parts
		 merge. They avoid repair code by preferencing values
		 into the same register. The results on their
		 architecture (a Multiflow Trace/300 VLIW machine) are
		 good."
}

@InProceedings{emmelmann91,
  author = 	"Helmut Emmelmann",
  title = 	"Code Selection by Regularly Controlled Term Rewriting",
  crossref =	"codegen91",
  pages = 	"3--29",
  annote = 	"Non-deterministic term rewriting is proposed as a
		 method for code selection. Instead of generating the
		 code through side effects of a tree parser, the
		 intermediate language is rewritten into the machine
		 code. This enables the application of multiple
		 rewrite rules, which makes possible a better factoring
		 of the code selection description. The paper also
		 describes an algorithm for processing such
		 descriptions and presents a few results."
}

@InProceedings{giegerich91,
  author = 	"Robert Giegerich",
  title = 	"Considerate Code Selection",
  crossref =	"codegen91",
  pages = 	"51--65",
  annote = 	"This approach promises to solve phase
		 ordering problems involving code selection. All
		 possible code selections are generated and later
		 passes decide which one is best. To make this
		 approach feasible, shared forests are used."
}

@InProceedings{boyland&emmelmann91,
  author = 	 {John Boyland and Helmut Emmelmann},
  title = 	 {Discussion: Code Generator Specification Techniques (Summary)},
  crossref =	"codegen91",
  pages = 	"66--69"
}

@Book{codegen91,
  title = 	"Code Generation --- Concepts, Tools, Techniques",
  booktitle = 	"Code Generation --- Concepts, Tools, Techniques",
  year = 	"1991",
  OPTaddress = 	"Schlo{\ss} Dagstuhl",
  publisher = 	"Springer",
  editor = 	"Robert Giegerich and Susan L. Graham",
  series = 	"Workshops in Computing"
}

@unpublished(haas92,
author="Mike Haas",
title="Re: Addressable Stacks?",
note="Usenet news group comp.lang.forth, message {``BprEnu.7AH@starnine.com''}",
month=jun,
year="1992",
annote="JForth V3.0 keeps up to 5 values in registers."
)

@InProceedings{chambers&ungar89,
  author = 	"Craig Chambers and David Ungar",
  title = 	"Customization: Optimizing Compiler Technology for
		 {{\sc {Self}}}, a Dynamically-Typed Object-Oriented
		 Programming Language",
  booktitle = 	"SIGPLAN '89 Conference on Programming Language Design
		 and Implementation",
  year = 	"1989",
  pages = 	"146--160"
}

@Book{ungar87,
  author = 	"David Ungar",
  title = 	"The Design and Evaluation of a High-Performance
		 Smalltalk System",
  publisher = 	"MIT Press",
  year = 	"1987"
}

@InProceedings{krall&berger92,
  author = 	"Andreas Krall and Thomas Berger",
  title = 	"Fast {Prolog} with a {VAM$_{1p}$} based {Prolog} Compiler",
  crossref =	"plilp92",
  pages = 	"245--259"
}

@Book{plilp92,
  title = 	"Programming Language Implementation and Logic
		 Programming (PLILP '92)",
  booktitle = 	"Programming Language Implementation and Logic
		 Programming (PLILP '92)",
  publisher = 	"Springer LNCS~631",
  year = 	"1992"
}

@Article{tanenbaum+83,
  author = 	"Andrew S. Tanenbaum and Hans van Staveren and E. G.
		 Keizer and Johan W. Stevenson",
  title = 	"A Practical Tool Kit for Making Portable Compilers",
  journal = 	cacm,
  year = 	"1983",
  volume = 	"26",
  number = 	"9",
  pages = 	"654--660",
  month = 	sep,
  annote = 	"Describes the Amsterdam Compiler Kit and all its
		 phases."
}

@Article{cytron+91,
  author = 	"Ron Cytron and Jeanne Ferrante and Barry K. Rosen and
		 Mark N. Wegman and F. Kenneth Zadeck",
  title = 	"Efficiently Computing Static Single Assignment Form
		 and the Control Dependence Graph",
  journal = 	toplas,
  year = 	"1991",
  volume = 	"13",
  number = 	"4",
  pages = 	"451--490",
  month = 	oct
}

@InProceedings{koopman92,
  author = 	"Koopman, Jr., Philip J.",
  title = 	"A Preliminary Exploration of Optimized Stack Code
		 Generation",
  booktitle = 	"1992 Rochester Forth Conference",
  year = 	"1992",
  abstract =	"This paper presents an experimental code generator
		 that performs intra-block stack scheduling for a
		 stack-based execution model. For small test programs,
		 91\% to 100\% of redundant local variable accesses were
		 eliminated using this compiler.  Compiled intra-block
		 stack scheduling and hand-performed global stack
		 scheduling show that significant opportunities exist
		 to keep temporary variable values on the expression
		 evaluation stack when compiling conventional
		 languages."
}

@Article{chang+92,
  author = 	"Pohua P. Chang and Scott A. Mahlke and William Y.
		 Chen and {Wen-mei} W. Hwu",
  title = 	"Profile-guided Automatic Inline Expansion for {C} Programs",
  journal = 	spe,
  year = 	"1992",
  volume = 	"22",
  number = 	"5",
  pages = 	"349--369",
  month = 	may
}

@PhdThesis{briggs92,
  author = 	"Preston Briggs",
  title = 	"Register Allocation via Graph Coloring",
  school = 	"Rice University",
  year = 	"1992",
  address = 	"Houston"
}

@InProceedings{hendren+92cc,
  author = 	"Laurie J. Hendren and Guang R. Gao and Erik R. Altman
		 and Chandrika Mukerji",
  title = 	"A Register Allocation Framework Based on Hierarchical
		 Cyclic Interval Graphs",
  crossref =	"cc92",
  pages = 	"176--191",
  annote = 	"A new register allocation algorithm. It works well
		 for inner loops and can be generalized for more
		 complex control structures. Its performance in these
		 cases is not yet clear."
}

@InProceedings{griesemer92,
  author = 	"Robert Griesemer",
  title = 	"Scheduling Instructions by Direct Placement",
  crossref =	"cc92",
  pages = 	"229--235",
  annote = 	"This scheduling algorithm tries to minimize compile
		 time. It does not build a data dependence graph and
		 it does not compute information on this graph in
		 extra passes. Instead, it takes each instruction as
		 generated by code selection, finds the first
		 idle cycle after its earliest execution time (EET),
		 and places the instruction there. This produces the
		 same schedule as list scheduling with EET as primary
		 heuristic and original order as tie-breaker."
}

@InProceedings{emmelmann92,
  author = 	"Helmut Emmelmann",
  title = 	"Testing Completeness of Code Selector Specifications",
  crossref =	"cc92",
  pages = 	"163--175"
}

@Proceedings{cc92,
  booktitle = 	"Compiler Construction (CC'92)",
  title = 	"Compiler Construction (CC'92)",
  key =		"CC'92",
  year = 	"1992",
  publisher = 	"Springer LNCS~641",
  address = 	"Paderborn",
}

@InProceedings{rogers&li92,
  author = 	"Anne Rogers and Kai Li",
  title = 	"Software Support for Speculative Loads",
  crossref =	"asplos92",
  pages = 	"38--50",
  annote = 	"Speculative loads bypass the cache, are scoreboarded,
		 and set a poison bit associated with the result
		 register, if the load causes an exception. When the
		 loaded value is used, poison causes an exception."
}

@InProceedings{mahlke+92,
  author = 	"Scott A. Mahlke and William Y. Chen and {Wen-mei} W.
		 Hwu and B. Ramakrishna Rau and Michael S. Schlansker",
  title = 	"Sentinel Scheduling for {VLIW} and Superscalar Processors",
  crossref =	"asplos92",
  pages = 	"238--247",
  annote = 	"Attacks the problem of speculatively executing
		 trapping instructions. An exception generated by a
		 speculative instruction is noted in a tag associated
		 with the result register. These tags are propagated
		 until checked by a sentinel instruction, which then
		 traps. Recovery is performed by restarting the code
		 from the original exception-causing instruction. Of
		 course this poses heavy restrictions on the register
		 allocator. Also, nothing ensures that the exceptions
		 are taken in the correct order."
}

@InProceedings{fisher&freudenberger92,
  author = 	"Joseph A. Fisher and Stefan M. Freudenberger",
  title = 	"Predicting Conditional Branch Directions From
		 Previous Runs of a Program",
  crossref =	"asplos92",
  pages = 	"85--95",
  annote = 	"``Instructions per mispredicted branch'' is proposed
		 as a more meaningful measure than the correct prediction
		 percentage. Programs from the SPEC89 suite and a few
		 others are measured with different data sets and
		 static branch prediction. Integer programs (and
		 spice2g6) have one misprediction for every 35--170
		 instructions, numeric software has 250--7500
		 instructions/misprediction. Using the wrong data set
		 for prediction often halves the prediction accuracy,
		 and reduces it to 12\% for spice2g6. These problems
		 can usually be avoided by using several different data
		 sets."
}

@InProceedings{smith+92,
  author = 	"Michael D. Smith and Mark Horowitz and Monica S. Lam",
  title = 	"Efficient Superscalar Performance Through Boosting",
  crossref =	"asplos92",
  pages = 	"248--259",
  annote = 	"An extension of the work of \cite{smith+90}.
		 Recovery of state at exceptions is now done by
		 compiler-generated code. The global scheduling scheme
		 they use is a variant of trace scheduling, used after
		 register allocation. They have reduced the hardware
		 cost of their scheme: One level of backup hardware
		 suffices for several levels of branch prediction.
		 This requires a more intelligent compiler, and
		 restricts scheduling freedom slightly. They give
		 experimental data for a simulated degree-2
		 superscalar machine. They achieve speedups of 1.5
		 over a scalar machine and 1.2 over nonspeculative
		 execution."
}

@InProceedings{andrews&sand92,
  author = 	 "Kristy Andrews and Duane Sand",
  title = 	 "Migrating a {CISC} Computer Family onto {RISC} via
		  Object Code Translation",
  crossref =	 "asplos92",
  pages =	 "213--222",
  annote =	 "They use binary translation for migrating existing
		  Tandem Nonstop (TNS) code to the new MIPS-based
		  Tandems. The TNS is a mixture of stack machine and
		  register machine (registers can be addressed
		  relative to a register stack pointer (RP) or
		  absolutely); one of the challenges of the
		  translation is predicting the value of RP at every
		  point in the program. If the prediction is wrong,
		  the translated program drops into interpretive mode
		  (for a time). The paper emphasizes, that the
		  resulting translated code can be debugged at the
		  original machine level. The resulting code on an
		  R3000 is about 5 times larger and about 3 times
		  faster than the original code on a CLX~800, a
		  machine based on similar technology."
}

@Proceedings{asplos92,
  key =		"ASPLOS-V",
  title = 	"Architectural Support for Programming Languages and
		 Operating Systems (ASPLOS-V)",
  booktitle = 	"Architectural Support for Programming Languages and
		 Operating Systems (ASPLOS-V)",
  year = 	"1992",
}

@InProceedings{robertson92,
  author = 	 {Alan M. Robertson},
  title = 	 {A 448 Byte {Forth} Multitasking Kernel},
  crossref =	 {euroforth92},
  pages =	 {55--59}
}

@InProceedings{charlton91,
  author = 	 {Gordon Charlton},
  title = 	 {{FOSM}, a {FOrth String Matcher}},
  booktitle = 	 {EuroForml '91 proceedings},
  year =	 {1991}
}

@InProceedings{charlton92,
  author = 	 {Gordon Charlton},
  title = 	 {{FOSM}, a {FOrth String Matcher}, continued},
  crossref =	 {euroforth92},
  pages =	 {113--122}
}

@Proceedings{euroforth92,
  key = 	"EuroForth~'92",
  title = 	"EuroForth~'92",
  booktitle = 	"EuroForth~'92",
  year = 	"1992",
  organization = 	"MicroProcessor Engineering",
  address = 	"Southampton, England"
}

@InProceedings{jakeman96,
  author = 	 {C. M. Jakeman},
  title = 	 {Portable Back-tracking In {ANS Forth}},
  booktitle = 	 {FORML '96 Proceedings},
  year =	 1996,
  url =		 {ftp://ftp.taygeta.com/pub/Forth/Applications/fosm1v1.zip}
}

@Misc{beusterien92,
  author = 	"Paul Beusterien",
  howpublished = 	"Personal communication",
  year = 	"1992",
  annote = 	"The Harris C compiler performs register reallocation
		 during instruction scheduling to reduce anti- and
		 output dependences."
}

@Misc{briggs92email,
  author = 	"Preston Briggs",
  howpublished = 	"Personal communication",
  year = 	"1992",
  annote = 	"Register selection does not pay off for register
		 allocation alone"
}

@InProceedings{auslander&hopkins82,
  author = 	"Marc Auslander and Martin Hopkins",
  title = 	"An Overview of the {PL.8} Compiler",
  crossref =	"sigplan82",
  pages = 	"22--31"
}

@Proceedings{sigplan82,
  key =		"SIGPLAN~'82",
  title = 	"SIGPLAN~'82 Symposium on Compiler Construction",
  booktitle = 	"SIGPLAN~'82 Symposium on Compiler Construction",
  year = 	"1982"
}

@InProceedings{vanhentenryck&deville91,
  author = 	"Van Hentenryck, Pascal and Yves Deville",
  title = 	"The Cardinality Operator: A new Logical Connective
		 for Constraint Logic Programming",
  crossref =	"iclp91",
  pages = 	"745--759",
  annote = 	"The cardinality operator is a very powerful
		 metaconstraint. Basically, it takes a number of
		 constraints, an upper and lower bound, and it
		 succeeds if the number of successful constraints lies
		 between the lower and the upper bound. This contains
		 the disjunction, negation and other connectives of
		 constraints."
}

@InProceedings{aggoun&beldiceanu91,
  author = 	"Abderrahmane Aggoun and Nicolas Beldiceanu",
  title = 	"Overview of the {CHIP} Compiler System",
  crossref =	"iclp91",
  pages = 	"775--789"
}

@Proceedings{iclp91,
  key =		"ICLP-8",
  title = 	"Eighth International Conference on Logic
		 Programming (ICLP-8)", 
  booktitle = 	"Eighth International Conference on Logic
		 Programming (ICLP-8)", 
  year = 	"1991",
  publisher = 	"MIT Press"
}

@InProceedings{taylor90,
  author = 	"Andrew Taylor",
  title = 	"{LIPS} on a {MIPS}",
  crossref =	"iclp90",
  pages = 	"174--185"
}

@Proceedings{iclp90,
  key =		"ICLP-7",
  title = 	"Seventh International Conference on Logic
		 Programming (ICLP-7)", 
  booktitle = 	"Seventh International Conference on Logic
		 Programming (ICLP-7)", 
  year = 	"1990",
  publisher = 	"MIT Press"
}

@inproceedings  (dincbas+88ecai,
author =        "Dincbas, M. and Simonis, H. and Van Hentenryck, P.",
title =         "{Solving the Car Sequencing Problem in Constraint Logic
Programming}",
booktitle =     "European Conference on Artificial Intelligence (ECAI-88)",
address =       "M{\"u}nchen",
year =          "1988"
)

@Article{paysan91,
  author = 	"Bernd Paysan",
  title = 	"{Ein optimierender Forth-Compiler}",
  journal = 	"Vierte Dimension",
  year = 	"1991",
  volume = 	"7",
  number = 	"3",
  pages = 	"22--25",
  month = 	sep
}

@InCollection{bundy+84,
  author = 	"Alan Bundy and du Boulay, Ben and Jim Howe and Gordon
		 Plotkin",
  title = 	"How to Get a {Ph.D.} in {AI}",
  booktitle = 	"Artificial Intelligence---Tools, Techniques, and
		 Applications",
  chapter = 	"5",
  publisher = 	"Harper \& Row",
  year = 	"1984",
  editor = 	"Tim O'Shea and Marc Eisenstadt",
  pages = 	"139--154",
  address = 	"New York"
}

@Article{march91,
  author = 	"Salvatore T. March",
  title = 	"Editorial Policy",
  journal = 	"ACM Computing Surveys",
  year = 	"1991",
  volume = 	"23",
  number = 	"2",
  pages = 	"133--141",
  month = 	jun
}

@PhdThesis{smith92,
  author = 	"Michael David Smith",
  title = 	"Support for Speculative Execution in High-Performance
		 Processors",
  school = 	"Stanford University",
  year = 	"1992",
  annote = 	"The main improvement over \cite{smith+92} is the
		 description of the global instruction scheduling
		  algorithm. It is based on trace scheduling, but
		  tries to minimize compensation code. It also does a
		  bit of scheduling across loop back-edges. He also
		  generalizes boosting to ``opportunistic instruction
		  scheduling'', but does not explain it well."
}

@InProceedings{chen+91,
  author = 	"William Y. Chen and Scott A. Mahlke and Pohua P.
		 Chang and {Wen-mei} W. Hwu",
  title = 	"Data Access Microarchitectures for Superscalar
		 Processors with Compiler-Assisted Data Prefetching",
  crossref =	"micro24",
  pages = 	"69--73",
  annote = 	"A small cache (1K) with a prefetch buffer (32
		 entries) is better than a larger cache (2K) for
		 programs with compiler-generated prefetch code. They
		 also give data on loads that depend on other loads
		 (making prefetching ineffective): 20--50\% of the
		 loads are restricted by that in the analysed traces."
}

@InProceedings{su&wang91,
  author = 	"Bogong Su and Jian Wang",
  title = 	"{GURPR*}: A New Global Software Pipelining Algorithm",
  crossref =	"micro24",
  pages = 	"212--216",
  annote = 	"A software pipelining algorithm for loops with
		 embedded conditionals, that produces smaller and
		 faster code than GURPR and sometimes much smaller
		 (and slightly smaller) code than perfect pipelining."
}

@Proceedings{micro24,
  key =		"MICRO-24",
  title =	"24th International Symposium on Microarchitecture (MICRO-24)",
  booktitle =	"24th International Symposium on Microarchitecture (MICRO-24)",
  year = 	"1991"
}

@InProceedings{sweany&beaty90,
  author = 	"Philip Sweany and Steven Beaty",
  title = 	"Post-Compaction Register Assignment in a Retargetable
		  Compiler",
  crossref =	"micro23",
  pages = 	"107--116",
  annote = 	"Discusses why prepass scheduling is usually
		  preferable. For some architectures prepass
		  scheduling means that scheduling must be repeated
		  for every spill code insertion pass."
}

@Proceedings{micro23,
  key =		"MICRO-23",
  title =	"23rd Annual Workshop on
		 Microprogramming and Microarchitecture (MICRO-23)",
  booktitle =	"23rd Annual Workshop on
		 Microprogramming and Microarchitecture (MICRO-23)",
  year = 	"1990"
}

@TechReport{fruehwirth92,
  author = 	 "Thom Fr{\"u}hwirth",
  title = 	 "Constraint Simplification Rules",
  institution =  "ECRC",
  year = 	 "1992",
  number = 	 "ECRC-92-18?"
}

@InProceedings{leprovost&wallace92,
  author=	{Le Provost, Thierry and Wallace, Mark},
  title=	{Domain Independent Propagation},
  pages=	{1004--1011},
  crossref=	{FGCS92},
  annote =	"Generalizes the lookahead mechanism of finite domains
		  to arbitrary domains: finite domains are generalized
		  to {\em basic formulae} expressible in the underlying
		  domain. The lookahead mechanism is generalised to
		  {\em generalized propagation} where a basic formula
		  that represents the solutions to the constraint as
		  close as possible is produced. The concept is
		  demonstrated by applying it to ordinary Prolog. The
		  basic formulae are ordinary Prolog bindings, which
		  (in contrast to finite domains) can express equality
		  of variables."
}

@Proceedings{FGCS92,
  title = 	{Proceedings of the International Conference on Fifth
		 Generation Computer Systems},  
  booktitle = 	{Proceedings of the International Conference on Fifth
		 Generation Computer Systems},
  year = 	{1992},
  publisher = 	{Association for Computing Machinery},
  address =     {ICOT, Japan},
}

@TechReport{leprovost&wallace92tr,
  author = 	 "Le Provost, Thierry and Mark Wallace",
  title = 	 "Generalised Constraint Propagation Over the CLP
		  Scheme",
  institution =  "ECRC",
  year = 	 "1992",
  number = 	 "ECRC-92-1",
  note = 	 "To appear in the Journal of Logic Programming",
  annote = 	 "Discusses generalised propagation in depth. In
		  particular, the paper gives a theoretical treatment
		  (unfortunately only for satisfaction-complete
		  domains) and discusses the implementation. It
		  presents topological branch and bound as a method
		  for implementing generalised propagation: It
		  terminates the computation of a propagation goal as
		  soon as it cannot generate new information."
}

@article{sidebottom&havens92,
  author  = {Gregory Sidebottom and William S. Havens},
  title   = {Hierarchical Arc Consistency for Disjoint Real
             Intervals in Constraint Logic Programming},
  journal = {Computational Intelligence},
  volume  = {8},
  number  = {4},
  pages   = {601--623},
  year    = {1992},
}

@inproceedings{theobald+92,
  author   = {Kevin B. Theobald and Guang R. Gao and Laurie J. Hendren},
  title    = {On the Limits of Program Parallelism and its Smoothability},
  crossref = {micro25},
  pages    = {10--19},
  annote   = {Another limits study. Interesting new results: Memory
              renaming is very important, finite instruction
              windows limit parallelism to much less than window
              size, and instruction-level parallelism is well
              smoothable.},
}

@inproceedings{vajapeyam&hsu92,
  author   = {Sriram Vajapeyam and Wei-Chung Hsu},
  title    = {On the Instruction-Level Characteristics of Scalar
              Code in Highly-Vectorized Scientific Applications},
  crossref = {micro25},
  pages    = {20--28},
  annote   = {An empirical analysis on the CRAY Y-MP. Its
              significance evades me.},
}

@inproceedings{chen+92,
  author   = {Chien-Ming Chen and Yunn-Yen Chen and Chung-Ta King},
  title    = {Branch Merging for Effective Exploitation of
              Instruction-Level Parallelism},
  crossref = {micro25},
  pages    = {37--40},
}

@inproceedings{degloria+92,
  author   = {De Gloria, Alessandro and Paolo Faraboschi and Mauro Olivieri},
  title    = {A Non-Deterministic Scheduler for a Software
              Pipelining Compiler},
  crossref = {micro25},
  pages    = {41--44},
  annote   = {Presents a mapping of the software pipelining problem
              to a Boltzmann Machine (similar to simulated
              annealing). The resulting algorithm has complexity
              $O(n^2)$.},
}

@inproceedings{chang&lang&shang92,
  author   = {Meng-Chu Chang and Feipei Lai and Rung-ji Shang},
  title    = {Exploiting Instruction-Level Parallelism with the
              Conjugate Register File Scheme},
  crossref = {micro25},
  pages    = {29--32},
  annote   = {Introduces a more programmable variation of Smith's
              shadow structures. The more interesting part of the
              paper describes the register allocator: A scheduling
              pass determines the costs of introducing
              antidependences, which are then used by the register
              allocator as scheduling-conflict graph. The
              scheduling proper is done in a postpass strategy.},
}

@inproceedings{mahlke+92micro,
  author   = {Scott A. Mahlke and David C. Lin and William Y. Chen
              and Richard E. Hank and Roger A. Bringmann},
  title    = {Effective Compiler Support for Predicated Execution
              Using the Hyperblock},
  crossref = {micro25},
  pages    = {45--54},
  annote   = {Discusses lots of issues involved in if-conversion,
              i.e. elimination of conditional branches by using
              predicated execution.},
}

@inproceedings{moon&ebcioglu92,
  author   = {Soo-Mook Moon and Kemal Ebcio{\u{g}}lu},
  title    = {An Efficient Resource-Constrained Global Scheduling
              Technique for Superscalar and {VLIW} Processors},
  crossref = {micro25},
  pages    = {55--71},
  annote   = {Discusses global scheduling in great detail. The
              algorithm also includes register renaming and a
              limited form of combining. The paper distinguishes
              global scheduling and software pipelining; it uses
              enhanced pipeline scheduling for software
              pipelining. The global scheduling algorithm is based on
              list-scheduling, like most others. The main
              heuristic is degree of speculativeness, which does
              not need profiling information. In spite of this,
              the results are impressive: Average speedup 4.8 for
              a VLIW machine with 16 simultaneous operations and
              16-way branching, with cache 3.7. The code expands
              by a factor of 2.1, the scheduling time is less than
              the rest of the compile time.},
}

@inproceedings{huang+92,
  author   = {Shih-Hsu Huang and Cheng-Tsung Hwang and Yu-Chin Hsu
              and Yen-Jen Oyang},
  title    = {A New Approach to Schedule Operations Across
              Nested-ifs and Nested-loops},
  crossref = {micro25},
  pages    = {268--271},
  annote   = {A global scheduling algorithm that first pushes
              operations (not branches) down- and inwards and then
              pushes as many upwards as is possible without making
              the basic blocks longer.},
}

@inproceedings{vegdahl92,
  author   = {Steven R. Vegdahl},
  title    = {A Dynamic-Programming Technique for Compacting Loops},
  crossref = {micro25},
  pages    = {180--188},
  annote   = {Extends an algorithm for optimal basic block
              scheduling to loop scheduling. The algorithm builds
              a graph, where the nodes represent the set of already
              executed instructions and the edges represent
              groups executed in one cycle. An optimal solution is
              found by using a shortest-path algorithm on the graph
              (shortest-cycle for loops). The algorithm can handle
              loops with up to 20--30 instructions.},
}

@inproceedings{sweany&beaty92,
  author   = {Philip H. Sweany and Steven J. Beaty},
  title    = {Dominator-Path Scheduling --- A Global Scheduling Method},
  crossref = {micro25},
  pages    = {260--263},
  annote   = {This is another scheduling algorithm similar to trace
              scheduling. Its selling point is that it does not
              need any code replication. Their prototype
              implementation achieved a speedup of 8.3\% on an
              RS6000.},
}

@inproceedings{kiyohara&gyllenhaal92,
  author   = {Tokuzo Kiyohara and John C. Gyllenhaal},
  title    = {Code Scheduling for {VLIW}/Superscalar Processors with
              Limited Register Files},
  crossref = {micro25},
  pages    = {197--201},
  annote   = {Solves the scheduling/register allocation phase
              ordering problem for unrolled loops by adding a bias
              to the priority function that tends to discourage
              overlap of iterations. The paper contains much
              empirical data. The technique proves to be quite
              effective.},
}

@inproceedings{warter+92,
  author   = {Nancy J. Warter and Grant E. Haab and Krishna
              Subramanian and John W. Bockhaus},
  title    = {Enhanced Modulo Scheduling for Loops with Conditional
              Branches},
  crossref = {micro25},
  pages    = {170--179},
  annote   = {Enhances modulo scheduling to loops with conditional
              branches. In contrast to hierarchical reduction,
              which preschedules the if-statements, enhanced
              modulo scheduling schedules all instructions at the
              same time and therefore avoids phase ordering
              problems. The new method performs 18\% better than
              hierarchical reduction for issue rates of 2--8.
              Since the target machine can only do one conditional
              branch/cycle, it does not do as well as predicated
              execution for high issue rates.},
}

@inproceedings{allen+92,
  author   = {V. H. Allan and J. Janardhan and R. M. Lee and M. Srinivas},
  title    = {Enhanced Region Scheduling on a Program Dependence Graph},
  crossref = {micro25},
  pages    = {72--80},
  annote   = {Enhanced region scheduling performs four
              transformations, using the PDG \comment{an AST-like
              representation} as representation. Code motion is
              performed to redistribute parallelism. Software
              pipelining increases parallelism within loops, loop
              peeling increases parallelism outside loops.
              Peephole compaction fine-tunes the parallelism.
              Enhanced region scheduling achieves 62.6\% speedup
              over sequential execution and 29.5\% speedup over (I
              think) basic block scheduling on a degree 7 machine.},
}

@inproceedings{capitanio+92,
  author   = {Andrea Capitanio and Nikil Dutt and Alexandru Nicolau},
  title    = {Partitioned Register Files for {VLIW}s: A Preliminary
              Analysis of Tradeoffs},
  crossref = {micro25},
  pages    = {292--300},
  annote   = {Tries to solve the problem of VLIWs' hunger for
              register ports: A simple VLIW model with partitioned
              register files is proposed, the code is partitioned
              using a search strategy (a fast version of simulated
              annealing), cross-partition move instructions are
              inserted, and the code is rescheduled. The empirical
              analysis considers machines with up to 8 functional
              units, 16 read ports and 4 partitions. While
              partitioning hurts when additional read ports are
              free, it increases performance when the cycle time
              depends on the number of read ports logarithmically.},
}

@proceedings{micro25,
  title     = {25th Annual International Symposium on Microarchitecture (MICRO-25)},
  booktitle = {25th Annual International Symposium on Microarchitecture (MICRO-25)},
  year      = {1992},
  key       = {MICRO-25},
}

@inproceedings{rather+93,
  author   = {Elizabeth D. Rather and Donald R. Colburn and
              Charles H. Moore},
  title    = {The Evolution of {Forth}},
  crossref = {hopl2preprints},
  pages    = {177--199},
}

@inproceedings{stroustroup93,
  author   = {Bjarne Stroustrup},
  title    = {A History of {C++}: 1979--1991},
  crossref = {hopl2preprints},
  pages    = {271--297},
  annote   = {Contains a very interesting observation about the
              interaction of typechecking and programmer behaviour
              (Section 15.2.4.3): ``As programmers learned C with
              Classes or C++, they lost the ability to quickly
              find the ``silly errors'' that creep into C programs
              through the lack of checking. Further, they failed
              to take the precautions against such silly errors
              that good C programmers take as a matter of
              course. After all, ``such errors don't happen in C
              with Classes.'' Thus, as the frequency of run-time
              errors caused by uncaught argument type errors goes
              down, their seriousness and the time needed to find
              them goes up.''},
}

@inproceedings{rather+96,
  author   = {Elizabeth D. Rather and Donald R. Colburn and
              Charles H. Moore},
  title    = {The Evolution of {Forth}},
  crossref = {hopl2},
  pages    = {625--658},
  url      = {http://www.forth.com/Content/History/History1.htm},
}

@inproceedings{kay96,
  author   = {Alan C. Kay},
  title    = {The Early History of {Smalltalk}},
  crossref = {hopl2},
  pages    = {511--579},
}

@inproceedings{steele&gabriel96,
  author   = {Guy L. Steele and Richard P. Gabriel},
  title    = {The Evolution of {Lisp}},
  crossref = {hopl2},
  pages    = {233--309},
}

@proceedings{hopl2preprints,
  title     = {History of Programming Languages (HOPL-II) Preprints},
  booktitle = {History of Programming Languages (HOPL-II) Preprints},
  year      = {1993},
  key       = {HOPL-II},
  note      = {SIGPLAN Notices 28(3)},
}

@proceedings{hopl2,
  title     = "History of Programming Languages",
  booktitle = "History of Programming Languages",
  publisher = "ACM Press/Addison-Wesley",
  year      = "1996",
  key       = "HOPL-II",
}

@book{levine+92,
  author    = {John R. Levine and Tony Mason and Doug Brown},
  title     = {lex \& yacc},
  publisher = {O'Reilly \& Associates},
  year      = {1992},
  edition   = {second},
  annote    = {Discusses all aspects of using lex and yacc in
               practice.},
}

@inproceedings{paysan93,
  author    = {Bernd Paysan},
  title     = {{ANS fig/GNU/??? Forth}},
  booktitle = {{Forth-Tagung}},
  year      = {1993},
}

@techreport{keppel+93,
  author      = {David Keppel and Susan J. Eggers and Robert R. Henry},
  title       = {A Case for Runtime Code Generation},
  institution = {Dept. of CS\&E, U. of Washington, Seattle},
  number      = {91-11-04},
  year        = {1991},
}

@techreport{sidebottom93,
  author      = {Greg Sidebottom},
  title       = {Compiling Constraint Logic Programming using
                 Interval Computations and Branching Constructs},
  institution = {Simon Fraser University},
  year        = {1993},
  OPTnumber   = {},
}

@inproceedings{rau&glaeser81,
  author   = {B. R. Rau and C. D. Glaeser},
  title    = {Some Scheduling Techniques and an Easily
              Schedulable Horizontal Architecture for High
              Performance Scientific Computing},
  crossref = {micro14},
  pages    = {183--198},
  annote   = {In contrast to other seminal papers this is not hard
              to read. It introduces modulo scheduling by first
              explaining DAG scheduling and vector loop
              scheduling. The second part of the paper discusses
              architectural support for schedulability. They
              propose delay elements (latches) at the ALU cross-bar
              meeting points. These delay elements and the
              register file should be organised as FIFOs to
              achieve register renaming.},
}

@proceedings{micro14,
  title     = {14th Annual Microprogramming Workshop (MICRO-14)},
  booktitle = {14th Annual Microprogramming Workshop (MICRO-14)},
  year      = {1981},
  key       = {MICRO-14},
}

@inproceedings{bourdoncle93,
  author   = {Fran{\c c}ois Bourdoncle},
  title    = {Abstract Debugging of Higher-Order Imperative Languages},
  crossref = {sigplan93},
  pages    = {46--55},
  annote   = {Programs are annotated with ``always'' and
              ``eventually'' assertions. The debugger then tries
              to prove these assertions by abstract
              interpretation. A system was implemented for a
              subset of Pascal. The main problem with this approach
              seems to be that imprecise information (e.g. from
              aliases) will result in lots of warnings. Some
              solutions for this are proposed (e.g.
              pass-in/pass-out instead of reference parameters).},
}

@inproceedings{adams+93,
  author   = {Norman Adams and Pavel Curtis and Mike Spreitzer},
  title    = {First-class Data-type Representations in {{\sc SchemeXerox}}},
  crossref = {sigplan93},
  pages    = {139--146},
  annote   = {Extends Scheme with routines, that can define types
              as basic as lists. To make the whole thing
              efficient, the optimizer includes special
              simplifying transformations.},
}

@inproceedings{grunwald+93,
  author   = {Dirk Grunwald and Benjamin Zorn and Robert Henderson},
  title    = {Improving the Cache Locality of Memory Allocation},
  crossref = {sigplan93},
  pages    = {177--186},
  annote   = {Compares several C memory allocation schemes wrt
              Paging and Cache behaviour. The best algorithms are
              also the fastest CPU-wise: BSD and QuickFit.},
}

@inproceedings{barret&zorn93,
  author   = {David A. Barrett and Benjamin G. Zorn},
  title    = {Using Lifetime Predictors to Improve Memory
              Allocation Performance},
  crossref = {sigplan93},
  pages    = {187--196},
  annote   = {The lifetime of dynamically allocated memory is
              predicted using a training run. It is predicted on a
              per-call-chain basis; the time unit is ``bytes
              allocated during the life time''. This information
              can be used to build more efficient allocators.},
}

@inproceedings{boehm93,
  author   = {Hans-Juergen Boehm},
  title    = {Space Efficient Conservative Garbage Collection},
  crossref = {sigplan93},
  pages    = {197--206},
  annote   = {Discusses ways to reduce the problems of
              conservative garbage collectors: Blacklisting (not
              allocating memory from certain addresses) reduces
              the number of pointer misidentifications (especially
              those due to constant static data). Clearing
              dead areas of the stack avoids lengthening many
              lifetimes. The paper also recommends thinking about
              garbage collection when designing data structures,
              i.e., not introducing more connectivity than used by
              the program.},
}

@inproceedings{flanagan+93,
  author   = {Cormac Flanagan and Amr Sabry and Bruce F. Duba and
              Matthias Felleisen},
  title    = {The Essence of Compiling with Continuations},
  crossref = {sigplan93},
  pages    = {237--247},
  annote   = {CPS compilers convert into CPS form, optimize the
              program and convert back. Isomorphic optimizations can
              be performed on the original program, saving the
              transformation to and from CPS form.},
}

@inproceedings{pinter93,
  author   = {Shlomit S. Pinter},
  title    = {Register Allocation with Instruction Scheduling: A
              New Approach},
  crossref = {sigplan93},
  pages    = {248--257},
  annote   = {Performs postpass scheduling with a modified
              register allocator: The conflict graph of the
              register allocator contains all potential conflict
              edges, i.e. more than a prepass approach. The paper
              gives a few heuristics for removing edges if
              registers become a problem, but no results.},
}

@inproceedings{huff93,
  author   = {Richard A. Huff},
  title    = {Lifetime-Sensitive Modulo Scheduling},
  crossref = {sigplan93},
  pages    = {258--267},
  annote   = {Modifies Modulo Scheduling to use an
              operation-driven scheduling strategy and gives
              heuristics that minimize register pressure. The
              algorithm produces tighter loops than Cydrome's
              scheduler and less register pressure. 93\% of the
              measured loops are within 10 registers of the ideal.},
}

@inproceedings{kolte&harrold93,
  author   = {Priyadarshan Kolte and Mary Jean Harrold},
  title    = {Load/Store Range Analysis for Global Register Allocation},
  crossref = {sigplan93},
  pages    = {268--277},
  annote   = {Load and Store ranges are subranges of live ranges
              that have separate costs and can be used for
              register allocation instead of live ranges. The idea
              was tested by compiling several small programs for
              processors with tiny register sets (1--8 registers).
              The spill code for the 8-register machine is reduced
              by -34\%--52\%.},
}

@inproceedings{kerns&eggers93,
  author   = {Daniel R. Kerns and Susan J. Eggers},
  title    = {Balanced Scheduling: Instruction Scheduling when
              Memory Latency is Uncertain},
  crossref = {sigplan93},
  pages    = {278--289},
  annote   = {Instructions that can be executed in parallel with a
              chain of loads are distributed evenly across the
              loads, in order to achieve better behaviour on
              cache misses etc. Balanced Scheduling offers a
              speedup of 6\%--8\% on a machine, where all
              latencies (except load) are one, with the Perfect
              benchmarks as workload.},
}

@inproceedings{warter+93,
  author   = {Nancy J. Warter and Scott A. Mahlke and {Wen-mei} W.
              Hwu and B. Ramakrishna Rau},
  title    = {Reverse If-Conversion},
  crossref = {sigplan93},
  pages    = {290--299},
}

@inproceedings{ball&larus93,
  author   = {Thomas Ball and James R. Larus},
  title    = {Branch Prediction for Free},
  crossref = {sigplan93},
  pages    = {300--313},
  annote   = {Branch prediction without profiling: For loop
              branches predicting looping gives good results. This
              paper points out that many loop branches are not
              backward branches, and that control flow analysis is
              needed to recognize them. It also presents several
              heuristics for predicting non-loop branches, based
              on the branch opcode, properties of the successor
              blocks (whether a loop, call, return, use or store
              is in one of the successors), or whether the loop is
              based on a pointer comparison. These heuristics
              have miss rates of (16--45\%), when they can be
              applied. They can be combined into a heuristic with
              26\% miss rate that can be applied to 79\%
              (5--100\%) of the non-loop branches, resulting in
              overall miss rates of 19\% (1--41\%). Finally they
              grind an axe with the instructions per mispredicted
              branch metric \cite{fisher&freudenberger92}, but
              this section is not very clear.},
}

@proceedings{sigplan93,
  title     = {SIGPLAN '93 Conference on Programming Language
               Design and Implementation},
  booktitle = {SIGPLAN '93 Conference on Programming Language
               Design and Implementation},
  year      = {1993},
  key       = {SIGPLAN '93},
  note      = {SIGPLAN Notices 28(6)},
}


@unpublished{vanhentenryck+91,
  author = {Van Hentenryck, Pascal and Vijay Saraswat and Yves Deville},
  title  = {Constraint Processing in {{\tt cc({FD})}}},
  note   = {Unpublished manuscript},
  url    = {ftp://parcftp.xerox.com/pub/ccp/ccfd/pldi-5.ps},
  year   = {1991},
  annote = {A new basis for the finite domain part of CHIP:
            The basic system only contains domain variables;
            There are four ways to construct constraints:
            Indexical constraints (somewhat similar to Nicolog's
            projection language), the cardinality operator,
            constructive disjunction, and extended asks.},
}

@incollection{ertl&krall95,
  author    = "M. Anton Ertl and Andreas Krall",
  title     = "High Level Constraints over Finite Domains",
  booktitle = "Constraint Processing",
  editor    = "Manfred Meyer",
  publisher = "Springer LNCS~923",
  pages     = "51--66",
  year      = "1995",
}

@article{vanroy94,
  author  = {Van Roy, Peter},
  title   = {1983--1993: The Wonder Years of Sequential Prolog Implementation},
  journal = {Journal of Logic Programming},
  year    = 1994,
  volume  = {19,20},
  pages   = {385--441},
  url     = {ftp://ftp.digital.com/pub/DEC/PRL/research-reports/PRL-RR-36.ps.Z},
  url2    = {http://www.ps.uni-sb.de/Papers/abstracts/SequentialPrologImp.html},
}

@proceedings{plsa94,
  title     = {Programming Languages and System Architectures},
  booktitle = {Programming Languages and System Architectures},
  editor    = {J{\"u}rg Gutknecht},
  publisher = {Springer LNCS~782},
  address   = {{Z\"urich}},
  year      = {1994},
  key       = {PLSA},
}

@article{peytonjones+93,
  author  = {Simon L. {Peyton Jones} and John Hughes and John Launchbury},
  title   = {How to Give a Good Research Talk},
  journal = sigplan,
  volume  = {28},
  number  = {11},
  pages   = {9--12},
  month   = nov,
  year    = {1993},
}

@inproceedings{hoffmann93,
  author    = {Ulrich Hoffmann},
  title     = {Static Stack Effect Analysis},
  booktitle = {EuroFORTH '93 conference proceedings},
  year      = {1993},
  address   = {Mari{\'a}nsk{\'e} L{\'a}zn{\v{e}} (Marienbad)},
}

@article{silberman&ebcioglu93,
  author  = {Gabriel M. Silberman and Kemal Ebcio{\u{g}}lu},
  title   = {An Architectural Framework for Supporting
             Heterogeneous Instruction-Set Architectures},
  journal = ieeecomputer,
  year    = {1993},
  volume  = {26},
  number  = {6},
  pages   = {39--56},
  month   = jun,
  annote  = {Proposes mechanisms for executing old binaries on
             new, high performance machines. Includes an
             interesting section on exception handling.},
}

@inproceedings{bringman+93,
  author   = {Roger A. Bringmann and Scott A. Mahlke and Richard E.
              Hank and John C. Gyllenhaal and {Wen-mei} W. Hwu},
  title    = {Speculative Execution Exception Recovery Using
              Write-Back Suppression},
  crossref = {micro26},
  pages    = {214--223},
  annote   = {A hardware mechanism for the problem of
              speculatively executing potentially excepting
              instructions. It presumes a superblock scheduling
              framework. After the exception has happened, the
              writeback of that instruction and all subsequent
              instructions whose speculation depth is at least as
              great as that of the excepting instruction are
              suppressed. After detecting that the exception is
              relevant, the hardware reexecutes the instructions
              whose writeback was suppressed. Exceptions from
              instructions from different basic blocks complicate
              matters a little more.},
}

@proceedings{micro26,
  title     = {26th Annual International Symposium on Microarchitecture (MICRO-26)},
  booktitle = {26th Annual International Symposium on Microarchitecture (MICRO-26)},
  year      = {1993},
  key       = {MICRO-26},
}

@proceedings{cc94,
  title     = {Compiler Construction (CC '94)},
  booktitle = {Compiler Construction (CC '94)},
  year      = {1994},
  key       = {CC '94},
  publisher = {Springer LNCS~786},
  address   = {Edinburgh},
  month     = apr,
}

@manual{dpans6-93,
  title = {Draft proposed American National Standard --- Forth
           (X3J14 dpANS6)},
  year  = {1993},
  key   = {ANS~Forth},
}

@article{hayes92,
  author   = {John R. Hayes},
  title    = {User-Defined Local Variable Syntax with {ANS Forth}},
  journal  = sigforth,
  volume   = {4},
  number   = {2},
  year     = {1992},
  OPTpages = {19, 20, 26},
  annote   = {Shows how to define a nice locals syntax using the ANS
              Forth locals wordset.},
}

@inproceedings{diaz&codognet93,
  author    = {Daniel Diaz and Philippe Codognet},
  title     = {A Minimal Extension of the {WAM} for {{\tt clp(FD)}}},
  pages     = {774--790},
  booktitle = {International Conference on Logic Programming (ICLP)},
  year      = 1993,
}

@inproceedings{jourdan&sola93,
  author    = {Jean Jourdan and Thierry Sola},
  title     = {The Versatility of Handling Disjunctions as Constraints},
  booktitle = {Programming Language Implementation and Logic
               Programming (PLILP)},
  pages     = {60--74},
  year      = {1993},
}

@incollection{codognet+93,
  author    = {Philippe Codognet and Fran{\c{c}}ois Fages and Thierry Sola},
  title     = {A Metalevel Compiler of {CLP(FD)} and Its
               Combination with Intelligent Backtracking},
  booktitle = {Constraint Logic Programming: Selected Research},
  publisher = {MIT Press},
  year      = 1993,
  editor    = {Fr{\'e}d{\'e}ric Benhamou and Alain Colmerauer},
  pages     = {437--456},
}

@article{klint81,
  author  = {Paul Klint},
  title   = {Interpretation Techniques},
  journal = spe,
  volume  = {11},
  pages   = {963--973},
  year    = {1981},
  annote  = {General discussion of interpreters. Empirical
             comparison of direct threading, indirect threading
             and token threading on PDP-11 and CYBER-73.},
}

@book{krasner83,
  editor    = {Glen Krasner},
  title     = {Smalltalk-80: Bits of History, Words of Advice},
  publisher = {Addison-Wesley},
  year      = {1983},
}

@book{debaere&vancampenhout90,
  author    = {Eddy H. Debaere and Jan M. {Van Campenhout}},
  title     = {Interpretation and Instruction Path Coprocessing},
  publisher = {The MIT Press},
  year      = {1990},
  annote    = {Good discussion about interpretation with big
               bibliography. They propose instruction path
               coprocessing as a means to speed up interpreters. An
               instruction path coprocessor is similar to a
               microcode sequencer that has the code to be
               interpreted as machine code and the machine code of
               the main processor as microcode.},
}

@inproceedings{pittman87,
  author   = {Thomas Pittman},
  title    = {Two-Level Hybrid Interpreter/Native Code Execution
              for Combined Space-Time Efficiency},
  crossref = {sigplan87},
  pages    = {150--152},
  annote   = {Proposes the typical Forth way of speedup in
              Interpreters: Coding time-critical stuff in assembly
              language.},
}

@proceedings{sigplan87,
  title     = {Symposium on Interpreters and Interpretive
               Techniques (SIGPLAN '87)},
  booktitle = {Symposium on Interpreters and Interpretive
               Techniques (SIGPLAN '87)},
  year      = {1987},
  key       = {SIGPLAN '87},
}

@article{bell73,
  author  = {James R. Bell},
  title   = {Threaded Code},
  journal = cacm,
  volume  = {16},
  number  = {6},
  pages   = {370--372},
  year    = {1973},
}

@incollection{brandis95,
  author   = {Marc Brandis},
  title    = {Register allocation using graph coloring},
  crossref = {comp.compilers},
  volume   = {95-2},
  year     = {1995},
  annote   = {Discusses some papers on the topic. In Particular
              that he has implemented the algorithm of
              \cite{callahan&koblenz91}.},
}

@misc{comp.compilers,
  title        = {{\tt comp.compilers}},
  booktitle    = {{\tt comp.compilers}},
  howpublished = {Usenet Newsgroup; archives available from http://www.iecc.com/compilers/},
  key          = {{\tt c.c}},
}

@inproceedings{poeial94,
  author   = {Jaanus P{\"o}ial},
  title    = {Forth and Formal Language Theory},
  crossref = {euroforth94},
  pages    = {47--52},
  annote   = {Shows that the stack effect notation is at least as
              powerful as context-free grammars and that it is better
              suited for specifying the syntax of Forth.},
}

@proceedings{euroforth94,
  title     = {EuroForth~'94 Conference Proceedings},
  booktitle = {EuroForth~'94 Conference Proceedings},
  address   = {Winchester, UK},
  year      = {1994},
  key       = {EuroForth '94},
}

@article{golberg91,
  author  = {David Goldberg},
  title   = {What Every Computer Scientist Should Know About
             Floating-Point Arithmetic},
  journal = acmcs,
  volume  = {23},
  number  = {1},
  pages   = {5--48},
  year    = {1991},
}

@inproceedings{burger+95,
  author   = "Robert G. Burger and Oscar Waddell and R. Kent Dybvig",
  title    = "Register Allocation Using Lazy Saves, Eager
              Restores, and Greedy Shuffling",
  crossref = "sigplan95",
  pages    = "130--138",
}

@proceedings{sigplan95,
  title     = {SIGPLAN '95 Conference on Programming Language
               Design and Implementation},
  booktitle = {SIGPLAN '95 Conference on Programming Language
               Design and Implementation},
  year      = {1995},
  key       = {SIGPLAN '95},
}

@article{blake77,
  author  = {Russell P. Blake},
  title   = {Exploring a Stack Architecture},
  journal = ieeecomputer,
  volume  = {10},
  number  = {5},
  pages   = {30--39},
  month   = may,
  year    = {1977},
}

@inproceedings{ditzel&mclellan82,
  author    = {David R. Ditzel and H. R. McLellan},
  title     = {Register Allocation for Free: The {C} Machine Stack Cache},
  pages     = {48--56},
  booktitle = {Symposium on Architectural Support for Programming
               Languages and Operating Systems},
  year      = {1982},
}

@inproceedings{gallagher+94,
  author   = {David M. Gallagher and William Y. Chen and Scott
              A. Mahlke and John C. Gyllenhaal and {Wen-mei} W. Hwu},
  title    = {Dynamic Memory Disambiguation Using the Memory
              Conflict Buffer},
  crossref = {asplos94},
  pages    = {183--193},
  annote   = {Hardware support for run-time disambiguation. Loads
              that are moved in front of stores (that might access
              the same memory) become {\em preloads}. At the
              original location of the load a {\em check}
              instruction is inserted. If there was a conflict,
              the check branches to compiler-generated recovery
              code. The memory conflict buffer that supports this
              architecture is a cache-like structure that
              associates the destination register of a preload
              with the accessed address. Stores to the same
              address (and sometimes other stores and loads) set
              the conflict bit that makes the check branch to the
              recovery code. This structure detects all conflicts,
              but also gives false alarms sometimes. The number of
              false alarms can be kept low by setting the
              parameters (number of entries, associativity, ...)
              to appropriate values. The speedups are quite
              spectacular (more than 2.5 for alvinn and cmp on an 8-issue
              machine).},
}


@inproceedings{young&smith94,
  author   = {Cliff Young and Michael D. Smith},
  title    = {Improving the Accuracy of Static Branch Prediction
              Using Branch Correlation},
  crossref = {asplos94},
  pages    = {232--241},
  annote   = {This profiling-based method is quite different from
              the hardware schemes, because it differentiates
              between the paths on which a branch is executed. If
              the branch predictions along different paths are
              different, the branch (and part of the paths to it)
              is replicated. The improvements in prediction
              accuracy seem to be in the same league as Krall's
              results.},
}

@inproceedings{calder&grunwald94,
  author   = {Brad Calder and Dirk Grunwald},
  title    = {Reducing Branch Costs via Branch Alignment},
  crossref = {asplos94},
  pages    = {242--251},
  annote   = {Branch alignment is the rearrangement of the basic
              block such that the costs of conditional branches
              are minimized. The best earlier algorithm greedily
              reduces the cost of one control flow edge at a time,
              starting with the most-executed edge. The algorithm
              of this paper exhaustively tries all possibilities for
              15 edges at a time.},
}

@inproceedings{engler&proebsting94,
  author   = {Dawson R. Engler and Todd A. Proebsting},
  title    = {{DCG}: An Efficient, Retargetable Dynamic Code
              Generation System},
  crossref = {asplos94},
  pages    = {263--271},
  annote   = {Generates code for the MIPS and SPARC. The interface
              used is {\tt lcc}'s code generation interface, so
              dynamic code is generated a procedure at a time,
              using C calling conventions. The code generator
              takes about 350 instructions to generate one
              instruction (about 100 times faster than gcc). This
              paper also presents some interesting examples where
              dynamic code generation provides great speedups over
              other approaches to solve the same problem (e.g., a
              general algorithm, or an interpreter).},
}

@proceedings{asplos94,
  key       = {ASPLOS-VI},
  title     = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-VI)},
  booktitle = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-VI)},
  year      = {1994},
}

@Book{weiss&smith94,
  author =    "Shlomo Weiss and James E. Smith",
  title =     "Power and PowerPC",
  publisher = "Morgan Kaufmann",
  year =      "1994",
  annote =    "Provides an overview of the Power and PowerPC
               architectures and looks at the Power1, Power2 and
               MPC~601 implementations. Finally, the authors look
               at memory and I/O subsystems of machines employing
               these processors and compare the MPC~601 with the
               21064 implementation of the Alpha architecture."
}

@inproceedings{wu&larus94,
  author   = {Youfeng Wu and James R. Larus},
  title    = {Static Branch Frequency and Program Profile Analysis},
  pages    = {1--11},
  crossref = {micro94},
  annote   = {The heuristics for branch prediction of
              \cite{ball&larus93} are not only used for predicting the
              branch direction, but also for predicting the branch
              probability and, consequently, the profile. If several
              heuristics predict a branch, they are combined using a
              mathematical method called the Dempster-Shafer theory of
              evidence. The results are evaluated by comparing the top
              $n$\% of static and dynamic profiles. The results are
              much better than for the heuristics used in
              \cite{wall91pldi}.},
}

@inproceedings{schlansker+94,
  author   = {Michael Schlansker and Vinod Kathail and Sadun Anik},
  title    = {Height Reduction of Control Recurrences for {ILP} Processors},
  pages    = {40--51},
  crossref = {micro94},
  annote   = {Height reduction is applied to recurrences on which
              branches (in particular loop exit branches) depend.},
}

@TechReport{schlansker&kathail93,
  author =      "Michael Schlansker and Vinod Kathail",
  title =       "Acceleration of Algebraic Recurrences on Processors
                 with Instruction Level Parallelism",
  institution = "HP Laboratories",
  year =        "1993",
  type =        "Technical Report",
  number =      "HPL-93-55",
  note =        "A shorter version appeared in \cite{bannerjee94}.",
  annote =      "The associative and distributive laws are applied
                 to reduce recurrence (cyclic data flow paths)
                 heights in (DO) loops. The basic idea is to
                 replace some references to loop-variant variables
                 with the expression assigned to that variable. The
                 resulting big expressions can then be transformed to
                 minimize the critical path length, usually in a way
                 requiring more resources. This paper introduces
                 blocked back-substitution: The loop is unrolled
                 several times, and only the loop-carried copies of
                 the variables are back-substituted, the others are
                 computed using the slow, but resource-saving
                 method. The paper explains how to apply blocked
                 back-substitution to first-order and higher-order
                 recurrences and gives formulae for the resulting
                 recurrence path length and the needed resources. For
                 first-order recurrences blocked back-substitution
                 works well, allowing the exploitation of unlimited
                 parallelism (assuming infinite loop trip counts)
                 while increasing the operation count just by a
                 constant factor."
}

@Proceedings{bannerjee94,
  title =  "Languages and Compilers for Parallel Computing",
  year =   "1994",
  editor = "Utpal Banerjee"
}

@InProceedings{noonburg&shen94,
  author =   "Derek B. Noonburg and John P. Shen",
  title =    "Theoretical Modeling of Superscalar Processor Performance",
  crossref = "micro94",
  pages =    "52--62"
}

@InProceedings{rau94,
  author =   "B. Ramakrishna Rau",
  title =    "Iterative Modulo Scheduling: An Algorithm for
              Software Pipelining Loops",
  crossref = "micro94",
  pages =    "63--74",
  annote =   "Analyses the compile-time of modulo scheduling, both
              empirically and with respect to the computational
              complexity. The paper also describes how to
              implement modulo scheduling for fast
              compilation. For the loops (Perfect Club, Spec,
              Livermore Fortran Kernels) and the machine model (a
              slightly modified Cydra~5) used in the paper, modulo
              scheduling scheduled each instruction only 1.59
              times."
}

@inproceedings{govindarajan+94,
  author   = {R. Govindarajan and Erik R. Altman and Guang R. Gao},
  title    = {Minimizing Register Requirements under
              Resource-Constrained Rate-Optimal Software Pipelining},
  pages    = {85--94},
  crossref = {micro94},
  annote   = {The problem is formalized as an optimization problem and
              a search algorithm is used to solve it. The results are
              compared with other algorithms. The method employed is
              too slow for practical use, but since it is optimal, it
              provides a good target for other methods.},
}

@inproceedings{golden&mudge94,
  author   = {Michael Golden and Trevor Mudge},
  title    = {A Comparison of Two Pipeline Organizations},
  pages    = {153--161},
  crossref = {micro94},
  annote   = {Compares Pipeline organizations where the ALU and branch
              resolution are in the first execute stage (i.e.,
              load-use-delays) and organizations where these functions
              are in the last execute stage (i.e.,
              address-generation-load delays and higher branch miss
              penalties). Interestingly, with good branch prediction
              the latter organization performs better even on code
              that is scheduled for the first organization. The
              explanation given is that a computed register is often
              used in several loads, and the delay is only incurred
              once, if at all.},
}

@inproceedings{razdan&smith94,
  author   = {Rahul Razdan and Michael D. Smith},
  title    = {A High-Performance Microarchitecture with
              Hardware-Programmable Functional Units},
  pages    = {172--180},
  crossref = {micro94},
  annote   = {A RISC processor is extended with a programmable
              functional unit (similar to an FPGA) that acts like
              other units, e.g., the ALU. The compiler automatically
              extracts stuff from the source program that can be
              accelerated with the PFU and translates it into hardware
              that can be accessed via PFU instructions. This approach
              results in speedups of 1.06--1.91 (1.06--1.16 without
              eqntott) for the SpecInt92 benchmarks.},
}

@inproceedings{hoogerbrugge&corporaal94,
  author   = {Jan Hoogerbrugge and Henk Corporaal},
  title    = {Register File Port Requirements of Transport Triggered
              Architectures},
  pages    = {191--195},
  crossref = {micro94},
  annote   = {The rationale for transport triggered architectures is
              that the compiler will allocate resources like ports and
              busses better than is usually done by hardware. This
              paper supports this claim empirically for register
              ports. With only one read and one write port to the
              register file they achieve 1.98 operations per cycle
              (however, FUs and buses are present in abundance). With
              3 read and write ports 3.63 operations/cycle are
              achieved, pretty close to the maximum of 3.8 with 6 read
              and write ports. Overall, transport triggered
              architectures need 0.5 read and 0.35 write ports per
              operation, compared to 2 read and 1 write port on a
              conventional architecture.},
}

@proceedings{micro94,
  key       = {MICRO-27},
  title     = {International Symposium on Microarchitecture (MICRO-27)},
  booktitle = {International Symposium on Microarchitecture (MICRO-27)},
  year      = {1994},
}

@InProceedings{bailey&sotudeh93,
  author =    "C. Bailey and R. Sotudeh",
  title =     "Quantitative Assessment of Machine-Stack Behaviour
               for Better Computer Performance",
  booktitle = "Proceedings of the 9th International Conference on
               Mathematical and Computer Modelling",
  year =      "1993",
  annote =    "Presents some quantitative evidence on the stack
               usage behaviour of Forth programs, unfortunately
               only based on four small benchmarks."
}

@Article{koopman94,
  author =  "Koopman, Jr., Philip",
  title =   "A Preliminary Exploration of Optimized Stack Code
             Generation",
  journal = jfar,
  year =    "1994",
  volume =  "6",
  number =  "3",
  pages =   "241--251",
  url =     "http://www.cs.cmu.edu/~koopman/stack_compiler/index.html",
  annote =  "This paper attacks the problem of optimizing code
             with local variables for stack machines. It starts
             with gcc's intermediate code and turns every
             pseudoregister reference into a local variable
             reference. Then it tries to convert local variable
             accesses to stack accesses. First the distances
             between accesses to the same variable are
             determined, then the values are allocated to the
             stack starting with the shortest distances. This
             algorithm is used for basic blocks and results in
             removing 91\%--100\% of the redundant local variable
             accesses. For global stack scheduling, a few
             experiences from manual optimization are reported."
}

@Article{hayes&lee89,
  author =  "John Hayes and Susan Lee",
  title =   "The Architecture of the {SC32} {Forth} Engine",
  journal = jfar,
  year =    "1989",
  volume =  "5",
  number =  "4",
  pages =   "493--506",
  annote =  "Describes the SC32 (aka FRISC3) processor in
             detail. In contrast to earlier FRISC designs, it has
             better support for loads, stores and literals and
             handles stack buffer overflow with hardware,
             reading/writing one cell at a time."
}

@Article{chan+94,
  author =  "Yin Chan and Ashok Sudarsanam and Andrew Wolfe",
  title =   "The Effect of Compiler-Flag Tuning on {SPEC} Benchmark Performance",
  journal = can,
  year =    "1994",
  volume =  "22",
  number =  "4",
  pages =   "60--70",
  annote =  "Discusses the ways in which SPEC measurements are made
             and their problems. In spite of investing great
             effort, the authors were not able to recreate all
             the results reported by the manufacturer. With more
             realistic compiler flags, they achieved 78\%--95\%
             of the manufacturer-reported performance. Further
             significant reductions in performance were measured under the
             realistic assumptions of using centralized file
             servers and producing code for older
             versions of the architecture."
}

@Article{charlesworth81,
  author =  "Alan E. Charlesworth",
  title =   "An Approach to Scientific Array Processing: The
             Architectural Design of the {AP-120B/FPS-164} Family",
  journal = ieeecomputer,
  year =    "1981",
  volume =  "14",
  number =  "9",
  pages =   "18--27",
  month =   sep,
  annote =  "Describes the architecture of two early LIW
             machines designed for scientific and signal
             processing. It could start one FP addition, one FP
             multiplication, one load from or store to each of the
             two memories and one read from and one write to each
             of the two register files, one address computation
             and one branch per cycle. The functional units were
             fully pipelined with relatively short pipelines for
             good scalar performance. Inner loops of applications
             are optimized using software pipelining, which is
             demonstrated using the dot product, the other code
             uses basic-block scheduling."
}

@Article{pelli87,
  author =  "Denis G. Pelli",
  title =   "Programming in {PostScript}",
  journal = byte,
  year =    "1987",
  volume =  "12",
  number =  "5",
  pages =   "185--202",
  month =   may,
  annote =  "Describes what you can do on laser printers and
             typesetting machines by programming PostScript
             directly."
}

@Article{seybold86,
  key =     "Seybold",
  title =   "{PostScript}: Can It Cut the Mustard?",
  journal = "The Seybold Report on Publishing Systems",
  year =    "1986",
  volume =  "15",
  number =  "12"
}

@inproceedings{dean&chambers94,
  author    = {Jeffrey Dean and Craig Chambers},
  title     = {Towards Better Inlining Decisions Using Inlining Trials},
  booktitle = {Conference on Lisp and Functional Programming},
  year      = {1994},
  pages     = {273--282},
  annote    = {Inlining decisions are based on actually trying and
               evaluating the inlining. The results are stored in a
               database that is used across compilations and across
               programs. The database is indexed with information
               derived from type group analysis, in order to make the
               result of one trial as widely applicable as possible.},
}

@Article{rau&fisher93,
  author =  "B. Ramakrishna Rau and Joseph A. Fisher",
  title =   "Instruction-Level Parallel Processing: History,
             Overview, and Perspective",
  journal = "Journal of Supercomputing",
  year =    "1993",
  volume =  "7",
  number =  "1/2",
  pages =   "9--50",
  note =    "Reprinted in \cite{rau&fisher93book}",
  annote =  "Gives an overview of historical developments in the
             area, concentrating on the VLIW architectures and
             the compilation techniques developed for them in
             the 1980s. Contains an extensive bibliography."
}

@Article{lowney+93,
  author =  "P. Geoffrey Lowney and Stefan M. Freudenberger and
             Thomas J. Karzes and W. D. Lichtenstein and Robert
             P. Nix and John S. O'Donnell and John C. Ruttenberg",
  title =   "The {Multiflow} Trace Scheduling Compiler",
  journal = "Journal of Supercomputing",
  year =    "1993",
  volume =  "7",
  number =  "1/2",
  pages =   "51--142",
  note =    "Reprinted in \cite{rau&fisher93book}",
  annote =  "Much of what is presented here has been described
             already in some form in \cite{ellis85}. One
             advantage of the Multiflow compiler over Bulldog is
             that it does not duplicate code when moving it up
             across an IF-statement. Another original
             contribution of this paper is a comparison of the
             Multiflow Trace 14/300 with contemporary
             competitors, the Convex C210 and the MIPS
             M/1000. Not surprisingly, the Multiflow outperforms
             its competitors on scalar FP code, the Convex is a
             little better on vector code, and the MIPS is a
             little better on scalar integer code. The paper also
             contains an evaluation of the compiler, in
             particular of compilation speed."
}

@Article{beck+93,
  author =  "Gary R. Beck and David W. L. Yen and Thomas L. Anderson",
  title =   "The {Cydra~5} Minisupercomputer: Architecture and Implementation",
  journal = "Journal of Supercomputing",
  year =    "1993",
  volume =  "7",
  number =  "1/2",
  pages =   "143--180",
  note =    "Reprinted in \cite{rau&fisher93book}"
}

@Article{dehnert&towle93,
  author =  "James E. Dehnert and Ross A. Towle",
  title =   "Compiling for the {Cydra~5}",
  journal = "Journal of Supercomputing",
  year =    "1993",
  volume =  "7",
  number =  "1/2",
  pages =   "181--227",
  note =    "Reprinted in \cite{rau&fisher93book}",
  annote =  "This compiler employs software pipelining for
             exploiting instruction-level parallelism."
}

@article{hwu+93,
  author  = {{Wen-mei} W. Hwu and others},
  title   = {The Superblock: An Effective Technique for {VLIW} and
             Superscalar Compilation},
  journal = {Journal of Supercomputing},
  volume  = {7},
  number  = {1/2},
  pages   = {229--248},
  year    = {1993},
  note    = {Reprinted in \cite{rau&fisher93book}},
}

@article{schuette&shen93,
  author  = {Michael A. Schuette and John P. Shen},
  title   = {Instruction-Level Experimental Evaluation of the
             {Multiflow TRACE 14/300 VLIW} Computer},
  journal = {Journal of Supercomputing},
  volume  = {7},
  number  = {1/2},
  pages   = {249--271},
  year    = {1993},
  note    = {Reprinted in \cite{rau&fisher93book}},
}

@book{rau&fisher93book,
  editor    = {B. Ramakrishna Rau and Joseph A. Fisher},
  title     = {Instruction-level parallelism},
  publisher = {Kluwer Academic Publishers},
  year      = {1993},
  note      = {Reprint of The Journal of Supercomputing, 7(1/2)},
}

@article{click&cooper95,
  author  = "Cliff Click and Keith D. Cooper",
  title   = "Combining Analyses, Combining Optimizations",
  journal = toplas,
  volume  = "17",
  number  = "2",
  pages   = "181--196",
  year    = "1995",
  annote  = "A theoretical paper on combining various optimizations in
             one phase, in order to eliminate phase-ordering problems
             and get better code.",
}

@inproceedings{baden90a,
  author    = {Wil Baden},
  title     = {Virtual Rheology},
  booktitle = {FORML'90 Proceedings},
  year      = {1990},
  annote    = {Explains how to construct any control flow graph using
               Forth's control-flow words.},
}

@inproceedings{baden90b,
  author    = {Wil Baden},
  title     = {How Many Forks for Deep Spaghetti},
  booktitle = {FORML'90 Proceedings},
  year      = {1990},
  annote    = {Shows that two branches are sufficient to produce
               unstructured code and enumerates the control structures
               that can be created using two branches.},
}

@inproceedings{baden90c,
  author    = {Wil Baden},
  title     = {How to Uncook Spaghetti},
  booktitle = {FORML'90 Proceedings},
  year      = {1990},
  annote    = {Unstructured constructs can be converted to structured
               code using either code duplication or compulsion
               (introducing new tests). The paper sketches a method
               for doing it. The author concludes: ``In general the
               cure isn't noticably better than the disease''.},
}

@inproceedings{baden90d,
  author    = {Wil Baden},
  title     = {Spaghetti Restructured},
  booktitle = {FORML'90 Proceedings},
  year      = {1990},
  annote    = {Exemplifies the elimination of unstructured code using
               Flynn's problem.},
}

@InProceedings{proebsting95,
  author =   "Todd A. Proebsting",
  title =    "Optimizing an {ANSI~C} Interpreter with Superoperators",
  crossref = "popl95",
  pages =    "322--332",
  annote =   "Interpreter performance is optimized by combining
              operators during code generation, when they are
              still organized as trees. So a different, optimized
              interpreter
              is used for each program. Speedups of 1.8--3.1 are
              achieved, but this is probably strongly dependent on
              the original set of operators. The paper uses lcc's
              intermediate code operators \cite{fraser&hanson91a}."
}

@proceedings{popl95,
  key       = {POPL '95},
  title     = {Principles of Programming Languages (POPL '95)},
  booktitle = {Principles of Programming Languages (POPL '95)},
  year      = {1995},
}

@misc{beuster95,
  author       = {Bernd Beuster},
  howpublished = {Usenet posting in de.comp.lang.forth},
  month        = may,
  year         = {1995},
  annote       = {A hand-tuned indirect threaded Forth interpreter
                  needs 2.7s for the Siev benchmark on a 486DX2/66.},
}

@book{smith92until,
  author    = {Norman Smith},
  title     = {Write Your Own Programming Language Using {C++}},
  year      = {1992},
  publisher = {Wordware Publishing},
  note      = {ISBN 1-55622-264-5},
}

@InProceedings{norris&pollock93,
  author =    "Cindy Norris and Lori L. Pollock",
  title =     "A Scheduler-Sensitive Global Register Allocator",
  booktitle = "Supercomputing'93",
  year =      "1993",
  url =       "ftp://www.eecis.udel.edu/pub/people/pollock/SSG.ps",
  annote =    "Like \cite{pinter93}, their global register
               allocator builds a maximal interference graph, i.e.,
               a graph that contains all interferences possible in
               various schedules. The register allocator does not
               introduce antidependences, and therefore provides
               maximum scheduling freedom. If the register
               allocator thinks it will run out of registers, it
               adds dependences to reduce the number of
               interferences. The paper empirically studies various
               heuristics for adding dependences, applied at
               various stages of the register allocator. The best
               one introduces dependences that remove the maximum
               number of interferences already before building
               the interference graph (based on a count of live
               values). With this heuristic, scheduler-sensitive
               register allocation is a little better than
               integrated prepass scheduling for 20 registers or
               more and a little worse for fifteen registers or
               less. Unfortunately they do not compare these
               methods with plain prepass scheduling."
}

@InProceedings{jourdan+95,
  author =   "St{\'e}phan Jourdan and Pascal Sainrat and Daniel Litaize",
  title =    "Exploring Configurations of Functional Units in an
              Out-Of-Order Superscalar Processor",
  crossref = "isca95",
  pages =    "117--125",
  annote =   "Using trace-based simulation, the authors measured
              varying configurations of superscalar processors,
              starting with the degree, then continuing with the
              instruction window size, integer units and data cache
              ports, variants of specialization for the integer
              units (with and without shifter) and floating-point
              units."
}

@InProceedings{ando+95,
  author =   "Hideki Ando and Chikako Nakanishi and Tetsuya Hara
              and Masao Nakaya",
  title =    "Unconstrained Speculative Execution with Predicated
              State Buffering",
  crossref = "isca95",
  pages =    "126--137",
  annote =   "Instructions are not simply marked as speculative,
              but already with the predicate they depend upon;
              if such an instruction is executed before the
              predicate is available, the result is
              marked with the predicate and stored in a shadow
              register instead of the main register, similar to
              boosting. One shadow register file suffices in the
              context of their compiler, which allocates registers
              to avoid conflicts. As soon as the predicate becomes
              available, the shadow register either becomes the
              main register (if the predicate is true), or is
              dropped (if it is false). Exception recovery is
              performed by hardware. The authors have developed
              compiler techniques based on region scheduling to
              exploit this architectural feature and present
              empirical results."
}

@inproceedings{mahlke+95,
  author   = {Scott A. Mahlke and Richard E. Hank and James E.
              McCormick and David I. August and {Wen-mei} W. Hwu},
  title    = {A Comparison of Full and Partial Predicated Execution
              Support for {ILP} Processors},
  pages    = {138--149},
  crossref = {isca95},
  annote   = {Compares an architecture where every instruction is
              predicated with another version of the architecture that
              has only a conditional move. On a degree-8 machine with
              only one branch/cycle conditional moves provide 30\%
              speedup, and full predication provides 30\% over
              conditional moves.},
}

@InProceedings{simone+95,
  author =   "M. Simone and others",
  title =    "Implementation Trade-Offs in Using a Restricted Data
              Flow Architecture in a High Performance RISC
              Microprocessor",
  crossref = "isca95",
  pages =    "151--162",
  annote =   "Discusses an aggressive out-of-order degree-4
              superscalar implementation of the SPARC
              architecture. Interestingly they implement only 4
              register windows. Their instruction window can
              contain 64 instructions; 38 integer and 48 32-bit FP
              registers are available for register renaming. The
              reservation stations for the integer, address
              generation and FP units each contain 8 entries, the
              load/store unit 12. They also discuss the algorithm
              for selecting instructions from the reservation
              stations."
}

@InProceedings{diep+95,
  author =   "Trung A. Diep and Christopher Nelson and John Paul Shen",
  title =    "Performance Evaluation of the PowerPC 620
              Microarchitecture",
  crossref = "isca95",
  pages =    "163--174",
  annote =   "Presents empirical data about the utilization and
              effectiveness of
              various parts of the PPC~620 microarchitecture."
}

@InProceedings{bodin&Seznec95,
  author =   "Fran{\c{c}}ois Bodin and Andr{\'e} Seznec",
  title =    "Skewed Associativity Enhances Performance Predictability",
  crossref = "isca95",
  pages =    "265--271",
  annote =   "Skewed associative caches map a memory block into
              one line in one bank of the cache and into a
              different line in a different bank. If the mapping
              functions are chosen right, memory locations
              conflicting in one bank probably do not conflict in a
              different bank, reducing conflict misses."
}

@InProceedings{young+95,
  author =   "Cliff Young and Nicolas Gloy and Michael D. Smith",
  title =    "A Comparative Analysis of Schemes for Correlated
              Branch Prediction",
  crossref = "isca95",
  pages =    "276--286",
  annote =   "This excellent paper first introduces a model for
              branch prediction: A prediction scheme divides a
              program execution into substreams and feeds
              each substream to a predictor. This framework is
              then used to analyse and compare existing branch
              prediction mechanisms, notably hardware- and
              profiling-based \cite{young&smith94}
              branch-correlation schemes. The differences are
              isolated and analysed empirically: Path history
              provides slightly better performance
              than pattern history, aliasing (table conflicts)
              significantly decreases performance, correlation
              across calls and returns plays a significant role in
              several benchmarks. The difference between a static
              predictor and a dynamic 2-bit adapting predictor is
              noticeable in only a few branches, but the misses
              caused by these branches are significant, sometimes
              favouring adaptivity, sometimes favouring static
              prediction."
}

@inproceedings{calder&grunwald95,
  author   = {Brad Calder and Dirk Grunwald},
  title    = {Next Cache Line and Set Prediction},
  pages    = {287--296},
  crossref = {isca95},
  annote   = {Instead of storing the full target address in a branch
              target buffer, it is cheaper to store just the cache
              location of the branch target, enabling target buffers
              with more entries (e.g., 1024 instead of 128 at the same
              cost) and better performance.},
}

@inproceedings{conte+95,
  author   = {Thomas M. Conte and Kishore N. Menezes and Patrick M.
              Mills and Burzin A. Patel},
  title    = {Optimization of Instruction Fetch Mechanisms for High
              Issue Rates},
  pages    = {333--344},
  crossref = {isca95},
}

@InProceedings{uhlig+95,
  author =   "Richard Uhlig and David Nagle and Trevor Mudge and
              Stuart Sechrest and Joel Emer",
  title =    "Instruction Fetching: Coping with Code Bloat",
  crossref = "isca95",
  pages =    "345--356",
  annote =   "Claims that the SPEC benchmarks are not
              representative for current applications with respect
              to instruction cache miss rate, proposes a new set
              of programs for instruction cache benchmarking and
              compares them empirically in this respect to the
              SPEC programs."
}

@InProceedings{lee+95,
  author =   "Dennis Lee and Jean-Loup Baer and Brad Calder and
              Dirk Grunwald",
  title =    "Instruction Cache Fetch Policies for Speculative
              Execution",
  crossref = "isca95",
  pages =    "357--367",
  annote =   "What should be done on an instruction cache miss for
              speculative instructions? If we fetch from main
              memory, what should be done if the speculation turns
              out to be wrong during the access? If the latency is
              large, speculative I-cache misses should not be
              served. If the latency is small, the speculative
              fetch should be performed; if the speculation is
              wrong, execution of the right path should be resumed
              without waiting for the wrong miss to complete."
}

@inproceedings{austin+95,
  author   = {Todd M. Austin and Dionisios Pnevmatikatos and Gurindar
              S. Sohi},
  title    = {Streamlining Data Cache Access with Fast Address
              Calculation},
  pages    = {369--380},
  crossref = {isca95},
  annote   = {Tries to shorten the average load latency by predicting
              the effective address by or-ing instead of adding the
              components together. Software support can increase the
              number of successful predictions. The number of correct
              predictions is surprisingly high, surpassing 50\% for
              most benchmarks even without software support and 95\%
              for most integer benchmarks with software support.},
}

@InProceedings{wang+95,
  author =   "Hong Wang and Tong Sun and Qing Yang",
  title =    "{CAT} --- Caching Address Tags. A Technique for
              Reducing Area Cost of On-chip Caches",
  crossref = "isca95",
  pages =    "381--390",
  annote =   "Many cache entries have the same few tags. This
              redundancy can be exploited by caching the tags
              themselves in a small cache. Empirical data suggests
              that 32 entries are sufficient for tags for a 16K
              or larger direct-mapped cache. The CAT reduces the
              size of the tag area by a factor of 2--6. If a CAT
              entry has to be replaced, all cache lines referring
              to that tag have to be invalidated; the paper does
              not discuss how to do this in a write-back cache."
}

@inproceedings{tullsen+95,
  author   = {Dean M. Tullsen and Susan J. Eggers and Henry M. Levy},
  title    = {Simultaneous Multithreading: Maximizing On-Chip Parallelism},
  pages    = {392--403},
  crossref = {isca95},
}

@inproceedings{sohi+95,
  author   = {Gurindar S. Sohi and Scott E. Breach and T. N. Vijaykumar},
  title    = {Multiscalar Processors},
  pages    = {414--425},
  crossref = {isca95},
  annote   = {Continues the work of \cite{franklin&sohi92}.},
}

@Proceedings{isca95,
  title =     "$22^{nd}$ Annual International Symposium on Computer Architecture",
  booktitle = "$22^{nd}$ Annual International Symposium on Computer Architecture",
  year =      "1995",
  key =       "ISCA-22",
}

@Article{kogge82,
  author =  "Peter M. Kogge",
  title =   "An Architectural Trail to Threaded-Code Systems",
  journal = ieeecomputer,
  year =    "1982",
  volume =  "15",
  number =  "3",
  pages =   "22--32",
  month =   mar,
  annote =  "Explains the design of (a classical
             implementation of) Forth, starting with threaded
             code, then adding the parameter stack, constants,
             variables, control structures, dictionary, outer
             interpreter and compiler."
}

@Article{epstein&gilliatt85,
  author =  "Arnold Epstein and Claire H. Gilliatt",
  title =   "The {MAGIC/L} Programming Language",
  journal = jfar,
  year =    "1985",
  volume =  "3",
  number =  "2",
  pages =   "9--21",
  note =    "1985 Rochester Forth Conference"
}

@inproceedings{briggs&cooper94,
  author   = "Preston Briggs and Keith D. Cooper",
  title    = "Effective Partial Redundancy Elimination",
  pages    = "159--170",
  crossref = "sigplan94",
}

@inproceedings{knoop+94,
  author   = "Jens Knoop and Oliver R{\"u}thing and Bernhard Steffen",
  title    = "Partial Dead Code Elimination",
  pages    = "147--158",
  crossref = "sigplan94",
}

@proceedings{sigplan94,
  key       = {SIGPLAN '94},
  title     = {SIGPLAN '94 Conference on Programming Language Design
               and Implementation},
  booktitle = {SIGPLAN '94 Conference on Programming Language Design
               and Implementation},
  year      = {1994},
}

@Book{fraser&hanson95,
  author =    {Christopher Fraser and David Hanson},
  title =     {A Retargetable {C} Compiler: Design and Implementation},
  publisher = {Benjamin/Cummings Publishing},
  year =      {1995},
  ISBN =      {0-8053-1670-1}
}

@inproceedings{stanley&wedig87,
  author   = {Timothy J. Stanley and Robert G. Wedig},
  title    = {A Performance Analysis of Automatically Managed Top of
              Stack Buffers},
  pages    = {272--281},
  crossref = {isca87},
  annote   = {They propose the top-of-stack buffer as special purpose
              cache for accesses to memory near the stack pointer in a
              conventional architecture. They look at three different
              algorithms for managing the buffer, some of which
              utilize otherwise unused memory cycles to manage the
              buffer proactively, avoiding delays later. Data based on
              the dhrystone benchmark are presented.},
}

@proceedings{isca87,
  key          = {ISCA-14},
  title        = {The $14^{th}$ Annual International Symposium on
                  Computer Architecture (ISCA)},
  booktitle    = {The $14^{th}$ Annual International Symposium on
                  Computer Architecture (ISCA)},
  organization = {IEEE Computer Society TCCA and ACM SIGARCH},
  address      = {Pittsburgh, Pennsylvania},
  month        = jun # " 2--5,",
  year         = {1987},
  note         = {{\em Computer Architecture News,} 15(2), June 1987},
}

@article{kanner+65,
  author    = {H. Kanner and P. Kosinski and C. L. Robinson},
  title     = {The structure of yet another {ALGOL} compiler},
  journal   = cacm,
  volume    = {8},
  number    = {7},
  pages     = {427--438},
  month     = jul,
  year      = {1965},
  coden     = {CACMA2},
  issn      = {0001-0782},
  bibdate   = {Sun Sep 18 23:35:40 1994},
  bibsource = {ftp://ftp.ira.uka.de/pub/bibliography/Compiler/bevan.bib
               and
               ftp://ftp.ira.uka.de/pub/bibliography/Compiler/Compiler.Lins.bib},
  abstract  = {A high-speed ``top down'' method of syntax analysis
               which completely eliminates ``back-up'' of the source
               string has been implemented in a convenient
               macro-language. A technique of simulation at compile
               time of the use of a conventional run-time stack
               enables the generation of code for expressions which
               minimizes stores, fetches and stack-pointer motion at
               run time, while properly treating recursion and side
               effects of procedures. Block structure and recursion
               are handled without need for interpretive methods at
               run time. The ``context problem'' in the transmission
               to recursive procedures of parameters ``called by
               name'' is solved in a manner which permits the handling
               of common cases of simple expressions and array
               identifiers with particular efficiency.},
  checked   = {19940407},
  sjb       = {Contains two good pieces of advice: (1) Do not bother
               to mechanize those operations which are easily
               performed by humans. (2) Do not perform at run time any
               bookkeeping operations that can reasonably be performed
               at compile time. The former led to the decision to
               writing the lexer/parser as set of recursive routines
               and the latter to the removal of any form of ``go to''
               interpreter \cite{Irons:Feurzig:cacm:1961}. Notes that
               the ALGOL report uses syntax to distinguish between
               arithmetic and boolean expressions but that this causes
               problems for their syntax analyser. The solution to the
               problems was to unify the syntax and make
               differentiating between the two types of expression a
               typing problem. Rest of the paper details solutions to
               the following areas: labels and multiple assignments;
               run time lists for {\bf own} variables; dealing with
               block structure using the symbol table; code generation
               for expressions; dealing with switches and
               procedures.},
  annote    = {Discusses several technical issues in Algol~60
               implementation; some of which are specific to the
               language, but others are still interesting today
               (e.g., how to deal with common prefixes in syntax
               analysis). They apparently generate code for an
               accumulator machine with several index registers
               without stack addressing modes. The code generation
               logically works with a stack in memory; however, the
               compiler emulates most stack pointer updates at
               compile-time and translates stack accesses into
               indexed accesses.}
}

@article{dewar75,
  author  = {Robert B. K. Dewar},
  title   = {Indirect Threaded Code},
  journal = cacm,
  year    = {1975},
  volume  = {18},
  number  = {6},
  month   = jun,
  pages   = {330--331},
  annote  = {Demonstrates a version of indirect threaded code
             with multiple code fields; it contrasts this with a
             version of direct threading that has a separate
             routine corresponding to each code field of the
             indirect threaded code (i.e., no immediate
             parameters).}
}

@book{goldberg&robson83,
  author    = {Adele Goldberg and David Robson},
  title     = {Smalltalk-80: The Language and its Implementation},
  year      = {1983},
  publisher = {Addison-Wesley}
}

@manual{ansforth94,
  key          = {ANS~Forth},
  title        = {American National Standard for Information Systems:
                  Programming Languages: Forth},
  organization = {American National Standards Institute},
  year         = 1994,
  note         = {Document X3.215-1994},
  url          = {http://www.complang.tuwien.ac.at/forth/dpans-html/dpans.htm}
}

@unpublished{hayes89,
  author = {John Hayes},
  title  = {Design Tradeoffs in a Top of Stack Cache},
  year   = {1989},
  note   = {Unpublished}
}

@article{baden95,
  author  = {Wil Baden},
  title   = {Pinhole Optimization},
  journal = {Forth Dimensions},
  volume  = 17,
  number  = 2,
  pages   = {29--35},
  year    = 1995
}

@inproceedings{vitek&horspool96,
  author   = {Jan Vitek and R. Nigel Horspool},
  title    = {Compact Dispatch Tables for Dynamically Typed Object
              Oriented Languages},
  pages    = {309--325},
  crossref = {cc96}
}

@inproceedings{proebsting&whaley96,
  author   = {Todd A. Proebsting and Benjamin R. Whaley},
  title    = {One-Pass, Optimal Tree Parsing --- With or Without Trees},
  pages    = {294--308},
  crossref = {cc96}
}

@proceedings{cc96,
  key       = {CC'96},
  title     = {Compiler Construction (CC'96)},
  booktitle = {Compiler Construction (CC'96)},
  editor    = {Tibor Gyim{\'o}thy},
  year      = {1996},
  OPTvolume = {1060},
  OPTseries = {LNCS},
  publisher = {Springer LNCS~1060},
  address   = {Link{\"o}ping}
}

@inproceedings{evers+96,
  author   = {Marius Evers and Po-Yung Chang and Yale N. Patt},
  title    = {Using Hybrid Branch Predictors to Improve Branch
              Prediction Accuracy in the Presence of Context
              Switches},
  pages    = {3--11},
  crossref = {isca96},
  annote   = {They propose a new mechanism for selecting between
              different branch predictors: For every BTB entry and
              predictor they introduce a two-bit counter that
              records how well this predictor did for this branch
              relative to the other predictors. A predictor using
              this mechanism outperforms other predictors for the
              SPECint92 benchmarks. They also apply their
              multi-hybrid predictor to traces that emulate
              context switching by regularly flushing the
              predictor. It also outperforms other predictors
              there: Some predictors (e.g., 2-bit counters)
              warm-up quickly, other predictors are more accurate,
              but have a long warm-up phase.}
}

@inproceedings{gloy+96,
  author   = {Nicolas Gloy and Cliff Young and J. Bradley Chen and
              Michael D. Smith},
  title    = {An Analysis of Dynamic Branch Prediction Schemes on
              System Workloads},
  crossref = {isca96},
  pages    = {12--21},
  annote   = {The paper evaluates the effectiveness of several
              branch predictors for full-system traces (i.e., with
              kernel branches and other processes). These traces
              contain many more static branches and therefore
              increase aliasing. Consequently, for
              a given implementation cost, schemes that use
              shorter histories do better relative to user-only
              traces. The paper also examines the practice of
              modeling context switching and kernel activity by
              regularly flushing the predictor: It concludes that
              this model is misleading, because it does not
              capture the differences in the organization and size
              of the schemes; it assumes that all models have the
              same contention.}
}

@inproceedings{sechrest+96,
  author   = {Stuart Sechrest and Chih-Chieh Lee and Trevor Mudge},
  title    = {Correlation and Aliasing in Dynamic Branch Predictors},
  pages    = {22--32},
  crossref = {isca96},
  annote   = {Examines the performance of various predictors for
              traces of larger programs. They conclude that PAs
              schemes perform relatively better than studies based
              on SPEC benchmarks indicate, especially if the full
              design space of these schemes is explored.}
}

@inproceedings{nayfeh+96,
  author   = {Basem A. Nayfeh and Lance Hammond and Kunle Olukotun},
  title    = {Evaluation of Design Alternatives for a
              Multiprocessor Microprocessor},
  pages    = {67--77},
  crossref = {isca96},
  annote   = {The basic assumption here is that, in the future,
              several processors will reside on one chip. How
              should they be interfaced to the memory hierarchy?
              Should they share the first-level cache, the
              second-level cache, or only memory? For a quite
              unrealistic model (sharing caches by having an
              $n\times n$ crossbar), they conclude that a
              shared-L1 architecture is best, even for
              multiprogramming workloads
              where no user-data is shared between
              processors. Assuming a higher L1 latency for the
              shared L1 cache makes the shared-memory scheme best
              for multiprogramming workloads. For data-sharing
              application, the shared-L1 scheme outperforms the
              others (except for MP3D, where the shared-L2 scheme
              is best).}
}

@inproceedings{burger+96,
  author   = {Doug Burger and James R. Goodman and Alain K{\"a}gi},
  title    = {Memory Bandwidth Limitations of Future Microprocessors},
  crossref = {isca96},
  pages    = {78--89},
  annote   = {The widening gap between processor speeds and memory
              speed is currently often closed with techniques that
              hide latency, but consume bandwidth (e.g.,
              prefetching). This paper predicts that, with this
              trend, bandwidth will soon become a problem. We
              would have to create chips with thousands or
              tens of thousands of pins that have to be driven at a GHz
              or more. They also present empirical data on the
              effectiveness of caches at multiplying external
              bandwidth; they compare usual (LRU replacement,
              32-byte lines) caches with optimal caches. They
              conclude that, in the short run, smarter caches (in
              particular, caches with shorter lines) will
              alleviate this problem, while in the long run, all memory
              will be on-chip.}
}

@inproceedings{seznec96,
  author   = {Andr{\'e} Seznec},
  title    = {Don't use the page number, but a pointer to it},
  crossref = {isca96},
  pages    = {104--113},
  annote   = {Page numbers are currently stored both in the TLB
              and in the caches. Instead of storing them several
              times the author proposes to save chip area by
              keeping them only once in a
              page number table, and to use an index into this
              table in the TLB and the caches. He shows how this
              can be implemented for several combinations of
              virtual and physical indexing and tagging.}
}

@inproceedings{Juan+96,
  author   = {Toni Juan and Tomas Lang and Juan J. Navarro},
  title    = {The Difference-Bit Cache},
  crossref = {isca96},
  pages    = {114--120},
  annote   = {Proposes a scheme for a two-way set-associative
              cache that is faster than other schemes. The
              problem with this scheme seems to be that it is only
              fast for virtual-tagged caches.}
}

@inproceedings{tullsen+96,
  author   = {Dean M. Tullsen and Susan J. Eggers and Joel S. Emer
              and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm},
  title    = {Exploiting Choice: Instruction Fetch and Issue on an
              Implementable Simultaneous Multithreading Processor},
  crossref = {isca96},
  pages    = {191--202},
  annote   = {An SMT processor exploits the resources available in
              a superscalar processor better by running several
              threads on the processor simultaneously. All the
              resources are shared between the threads, including
              the register set (which is larger, however), except
              for the program counter (and a return stack),
              retirement and related stuff. This paper
              concentrates on instruction fetching, which seems to
              have been identified as bottleneck in an earlier
              paper. They propose and measure several heuristics
              for selecting the thread from which to fetch. The
              best one is selecting the thread that has the least
              instructions in the instruction queues. With this
              heuristic an 8-thread 8-issue architecture achieves a
              throughput of 5.4 instructions/cycle.}
}

@inproceedings{hara+96,
  author   = {Tetsuya Hara and Hideki Ando and Chikako Nakanishi
              and Masao Nakaya},
  title    = {Performance Comparisons of {ILP} Machines with Cycle
              Time Evaluation},
  crossref = {isca96},
  pages    = {213--224},
  annote   = {Presents a VLIW with a mechanism called predicating
              that looks quite similar to boosting.}
}

@proceedings{isca96,
  key       = {ISCA '23},
  title     = {$23^{rd}$ Annual International Symposium on Computer Architecture},
  booktitle = {$23^{rd}$ Annual International Symposium on Computer Architecture},
  year      = {1996}
}

@techreport{moore&leach70,
  author      = {Charles H. Moore and Geoffrey C. Leach},
  title       = {{FORTH} -- A Language for Interactive Computing},
  institution = {Mohasco Industries, Inc.},
  year        = {1970},
  address     = {Amsterdam, NY},
  url         = {http://www.ultratechnology.com/F70POST.ZIP},
  url2        = {http://www.ultratechnology.com/4th_1970.html},
  annote      = {Describes Forth, as it was in 1970. There are
                 surprising differences from and surprising
                 similarities with modern Forth systems. The system
                 they describe uses text interpretation instead of
                 threaded code for definitions, although there is
                 already a code field, i.e., the foundation for indirect
                 threading. During the interpretation of a
                 definition, only words defined earlier are visible,
                 like in modern Forths, and in contrast to
                 Postscript. To make text interpretation speed
                 bearable, the dictionary is implemented as a hash
                 table with external chaining. There is support for
                 portably generating CODE words (called verbs in the
                 paper). The syntax is a bit more complicated than
                 today: special characters may only come as first
                 character in a word, so words are not only separated
                 by spaces. The system already features multitasking
                 (round-robin, with preemption). Source is stored in
                 screens (then called sheets) containing 50 lines by
                 40 characters. The programs look markedly different
                 than today because the primary stack manipulation
                 words are @T (similar to PICK) and =T (similar to a
                 word sometimes called STICK). Forth was running on
                 the IBM~1130 and the Burroughs B-5500 (different
                 cell sizes). The paper also observes that compactness of
                 programs ``arises through the economies of tailoring
                 definitions to a specific application'', and is more
                 pronounced in larger programs.}
}

@techreport{ans96rfi7,
  author      = {{TC X3J14}},
  title       = {Clarifying the distinction between
                 ``immediacy'' and ``special compilation
                 semantics''},
  institution = {ANSI TC X3J14},
  year        = {1996},
  type        = {RFI response},
  number      = {X3J14/Q0007R}
}

@techreport{ans99rfi-state,
  author      = {{TC X3J14}},
  title       = {Regarding compilation while in Interpretation state},
  institution = {ANSI TC X3J14},
  year        = {1999},
  type        = {RFI response},
  number      = {Q99-027},
  url         = {http://www.minerva.com/x3j14/queries/a99-027.txt}
}

@article{goodwin&wilken96,
  author  = {David W. Goodwin and Kent D. Wilken},
  title   = {Optimal and Near-optimal Global Register Allocation
             Using 0-1 Integer Programming},
  journal = spe,
  year    = {1996},
  volume  = {26},
  number  = {8},
  month   = aug,
  pages   = {929--965},
  annote  = {Describe global (intraprocedural) register
             allocation (with rematerialization, without live
             range splitting) as integer programming problem, and use a
             solver to get an optimal solution. The formulation
             as integer problem is pretty straight-forward:
             There is one variable for every live range part and
             real register, where live ranges are divided into
             parts at definition, use, load, and store points;
             the main nontrivial point here is that store points
             are only necessary right after control flow splits
             and after the definition; load points are only
             necessary right before control flow joins and before
             uses. The paper also presents very encouraging
             results on the SPECint92 benchmarks for the
             Precision Architecture. The spill overhead is
             reduced dramatically, resulting in a 0\%--10\%
             speedup over GCC's original register allocator and
             a Briggs-style allocator. The register allocation
             times are quite long, but not exponential;
             empirically, they show $n^3$-complexity with respect
             to the number of instructions. The register
             allocation times can be reduced by an order of
             magnitude with little degradation in allocation
             quality.}
}

@inproceedings{ruttenberg+96,
  author   = {John Ruttenberg and G. R. Gao and A. Stoutchinin and
              W. Lichtenstein},
  title    = {Software Pipelining Showdown: Optimal vs. Heuristic
              Methods in a Production Compiler},
  pages    = {1--11},
  crossref = {sigplan96},
  annote   = {The heuristic software pipeliner of the MIPSpro compiler for
              the R8000 is compared to the optimal MOST software
              pipeliner on the SPECFP and other benchmarks. The
              MIPSpro pipeliner is based on modulo
              scheduling. Surprisingly, the MIPSpro compiler
              performs as well as MOST for the scheduling itself,
              and due to better modeling (memory bank contention),
              it outperforms MOST overall. The authors state that
              there is still work to be done on  loops with low
              trip counts.}
}

@inproceedings{lee&leone96,
  author   = {Peter Lee and Mark Leone},
  title    = {Optimizing {ML} with Run-Time Code Generation},
  crossref = {sigplan96},
  pages    = {137--148},
  annote   = {They compile curried functions into code generators
              that generate specialized functions when applied to the
              first part of the arguments. The code generators are
              very fast, at 4--6 executed instructions per generated
              instruction. As a result, the break-even for using
              this feature for optimization can occur very early.
              They present results on a few benchmarks, mainly
              matrix multiplication and packet filtering. The
              break-evens are at $20 \times 20$ dense matrices
              (better for sparse matrices) and at
              250 packets, the asymptotic speedups are impressive
              (about 1.7 for dense matrices, 8 for sparse matrices,
              and for packet filtering 1.5 over the Berkeley C
              code).}
}

@inproceedings{eichenberger&davidson96,
  author   = {Alexandre E. Eichenberger and Edward S. Davidson},
  title    = {A Reduced Multipipeline Machine Description that
              Preserves Scheduling Constraints},
  pages    = {12--22},
  crossref = {sigplan96},
  annote   = {Representing scheduling constraints as finite state
              machines works for list scheduling, but not for
              algorithms (like most modulo scheduling schemes)
              that do not schedule cycle-by-cycle. In general,
              scheduling constraints can be represented by
              resource reservation tables. This paper describes
              how to reduce straight-forward reservation tables to
              smaller reservation tables that represent the same
              scheduling constraints.}
}

@inproceedings{bruggeman+96,
  author   = {Carl Bruggeman and Oscar Waddell and R. Kent Dybvig},
  title    = {Representing Control in the Presence of One-Shot
              Continuations},
  crossref = {sigplan96},
  pages    = {99--107},
  annote   = {Discuss how to represent multi-shot continuations
              and one-shot continuations in a stack-based Scheme
              implementation. One-shot continuations are
              programmer-specified (with the \emph{call/1cc}
              call). They offer a small (about 13\% in the threads
              benchmarks) performance benefit over multi-shot
              continuations, if they are applicable.}
}

@inproceedings{engler96,
  author   = {Dawson R. Engler},
  title    = {\textsc{vcode}: A Retargetable, Extensible, Very
              Fast Dynamic Code Generation System},
  pages    = {160--170},
  crossref = {sigplan96},
  annote   = {Describes a low-level interface and system for dynamic code
              generation. Code is generated for a function at a
              time. The interface provides essentially commands
              for generating code for an abstract RISC
              architecture; it also helps with the calling
              convention and with register allocation, i.e., all
              basic machine dependences. Code is produced
              separately for each virtual instruction, resulting in
              high code generation speed (10 instructions per
              generated instruction), but low run-time
              performance. Run-time performance can be achieved by
              using machine-specific knowledge and features (e.g.,
              there are features for delay slot filling).}
}

@inproceedings{auslander+96,
  author   = {Joel Auslander and Matthai Philipose and Craig
              Chambers and Susan J. Eggers and Brian N. Bershad},
  title    = {Fast, Effective Dynamic Compilation},
  pages    = {149--159},
  crossref = {sigplan96},
  annote   = {Describes a compiler for C with annotations for
              dynamic code generation for specialization to
              certain run-time constants. The paper discusses
              mainly the analysis necessary to determine the
              run-time constants and the control-flow. The
              run-time code generator optimizes quite a bit, and
              also has some inefficiencies, leading to relatively
              slow run-time code generation, late break-even
              points, but good asymptotic speedups. The
              annotations are not safe, i.e., a program can be
              broken by adding the wrong annotations.}
}

@inproceedings{ramsey96,
  author   = {Norman Ramsey},
  title    = {Relocating Machine Instructions by Currying},
  pages    = {226--236},
  crossref = {sigplan96},
  annote   = {The paper describes how to derive  the relocation
              information and relocation functions for a machine
              automatically from a description of the machine
              code, eliminating the machine-dependence and
              redundant information present in traditional
              linkers. The linker and object files can be seen
              as the residual program of a partially evaluated
              assembler (the locations of some symbols are still
              missing). There is no need to have a residual
              function for each relocation point, there is just a
              closure for each relocation point, and all closures
              share a few functions. These functions correspond to
              traditional linker transformations, and the closures
              to traditional relocation items in object files.}
}

@inproceedings{jagannathan&wright96,
  author   = {Suresh Jagannathan and Andrew Wright},
  title    = {Flow-directed Inlining},
  pages    = {193--205},
  crossref = {sigplan96},
  annote   = {Describes inlining in a larger context: flow
              analysis, the selection of inlining sites, and
              simplification are described in detail, and
              empirical results are given.}
}

@inproceedings{ramalingam96,
  author   = {G. Ramalingam},
  title    = {Data Flow Frequency Analysis},
  pages    = {267--277},
  crossref = {sigplan96},
  annote   = {Extends conventional data flow analysis from
              qualitative information to quantitative information
              about the frequency and/or the probability of
              facts. A theoretical paper.}
}

@proceedings{sigplan96,
  key       = {PLDI '96},
  title     = {SIGPLAN '96 Conference on Programming Language
               Design and Implementation},
  booktitle = {SIGPLAN '96 Conference on Programming Language
               Design and Implementation},
  year      = {1996}
}

@article{proebsting95toplas,
  author  = {Todd A. Proebsting},
  title   = {{BURS} Automata Generation},
  journal = toplas,
  volume  = {17},
  number  = {3},
  pages   = {461--486},
  month   = may,
  year    = {1995},
  annote  = {The journal version of \cite{proebsting92}.}
}

@inproceedings{wendt90,
  author    = {Alan L. Wendt},
  title     = {Fast Code Generation Using Automatically-Generated
               Decision Trees},
  booktitle = {SIGPLAN '90 Conference on Programming Language
               Design and Implementation},
  year      = {1990},
  pages     = {9--15},
  annote    = {Describes code generation based on DAG
               rewriting. The code generators described in the
               paper are particularly fast, because the code
               generator generator combines several rules into
               profitable combinations in a preprocessing step. A
               training run is used to determine which combinations
               play a role in practice. This optimization halves
               the number of rule applications, and even the number
               of rules is slightly reduced.}
}

@article{briggs&torczon93,
  author  = {Preston Briggs and Linda Torczon},
  title   = {An Efficient Representation for Sparse Sets},
  journal = {ACM Letters on Programming Languages and Systems},
  volume  = {2},
  number  = {1--4},
  pages   = {59--69},
  year    = {1993},
  annote  = {Describes a set representation that is
             asymptotically more
             time-efficient than the classical bit-vector for
             operations like \emph{clear-set} and \emph{forall},
             but needs much more memory. The paper also presents
             empirical data from micro-benchmarks that shows that
             the \emph{member}, \emph{add-member} and
             \emph{delete-member} operations are about three
             times slower on an RS/6000 with the new
             representation than with the bit-vector
             representation. It also presents empirical data from
             compilations of several routines, that shows that
             the new representation significantly reduces the
             register allocation time for these routines.}
}

@article{appel94toplas,
  author  = {Andrew W. Appel},
  title   = {Axiomatic Bootstrapping: A Guide for Compiler Hackers},
  journal = toplas,
  year    = {1994},
  volume  = {16},
  number  = {6},
  pages   = {1699--1718},
  month   = nov,
  annote  = {Enhances the T-diagram formalism with constraints
             (called axioms in the paper), which allow a more
             precise and less restrictive description of boot-strapping
             cross-compilation than opaque T-diagrams. This
             approach is explained with examples from SML/NJ
             compilation. However, the paper did not convince me
             that using this formalism makes solving such
             problems easier.}
}

@inproceedings{olukotun+96,
  author   = {Kunle Olukotun and Basem A. Nayfeh and Lance
              Hammond and Ken Wilson and Kunyung Chang},
  title    = {The Case for a Single-Chip Multiprocessor},
  crossref = {asplos96},
  pages    = {2--11},
  annote   = {Present a convincing argument why doing wider
              issue is not very cost-effective: The R5000
              (single-issue for integers) has 70\% of the SPECints
              of the 4-issue R10000. They propose multiple
              processors per chip as an alternative and do an
              empirical comparison of a $4\times2$-way
              multiprocessor with separate primary caches and a
              shared on-chip secondary cache against a 6-way
              superscalar with the same amount of caches.}
}

@inproceedings{grunwald&neves96,
  author   = {Dirk Grunwald and Richard Neves},
  title    = {Whole-Program Optimization for Time and Space
              Efficient Threads},
  pages    = {50--59},
  crossref = {asplos96},
  annote   = {Presents two optimizations for threads:
              \begin{itemize}\item The stack space
              needed for a thread is computed from the call graph
              of the thread (back edges, i.e., recursion is
              handled by allocating a new stack segment when
              crossing the edge). The resulting stack segments are
              usually much smaller than with the original policy of
              starting with a page-sized segment, resulting in
              fewer TLB misses. \item For cooperative context
              switches, the context switch overhead is reduced by
              saving and restoring only registers that are live at
              that point in the respective threads.\end{itemize}
              They present empirical results that show speedups of
              12\%--21\% for the stack space optimization,
              1\%--15\% for the context switch optimization and
              15\%--35\% for the combination.}
}

@inproceedings{philbin+96,
  author   = {James Philbin and Jan Edler and Otto J. Anshus and
              Craig C. Douglas and Kai Li},
  title    = {Thread Scheduling for Cache Locality},
  pages    = {60--71},
  crossref = {asplos96},
  annote   = {Multi-Threading can improve performance by allowing
              the scheduler to order the threads in a
              cache-conscious way. This paper presents a
              cache-conscious thread scheduling algorithm and
              empirical results for several applications.}
}

@inproceedings{chen+96asplos,
  author   = {Peter M. Chen and Wee Teck Ng and Subhachandra
              Chandra and Christopher Aycock and Gurushankar
              Rajamani and David Lowell},
  title    = {The {Rio} File Cache: Surviving Operating System Crashes},
  crossref = {asplos96},
  pages    = {74--83},
  annote   = {Battery-backed and write-protected RAM is just as
              persistent as disks, but faster. So, it can be used
              as a write-back cache that provides as much
              reliability as synchronous writes provide now, with
              substantially improved performance. They
              performed crash tests that indicate that such
              a cache without write-protection is almost as
              reliable as a write-through system and with
              protection such a cache is more reliable. The
              performance is similar to a memory file system,
              is much better than systems with delayed
              write-through (while having better reliability), and
              the advantage over synchronous write-through
              systems is even higher.}
}

@inproceedings{mckinley&temam96,
  author   = {Kathryn S. McKinley and Olivier Temam},
  title    = {A Quantitative Analysis of Loop Nest Locality},
  crossref = {asplos96},
  pages    = {94--104},
  annote   = {An empirical check of popular assumptions of
              locality characteristics of numerical programs (the
              Perfect Benchmarks), in particular within loops. The
              results are that loop nests have different locality
              characteristics than the whole program and that some
              popular assumptions do not hold.}
}

@inproceedings{huang&shen96,
  author   = {Andrew S. Huang and John Paul Shen},
  title    = {The Intrinsic Bandwidth Requirements of Ordinary Programs},
  pages    = {105--114},
  crossref = {asplos96},
  annote   = {This paper proposes using perfect caches of various
              sizes to estimate the main memory bandwidth
              requirements of specific programs. It performs such
              measurements for processors with various issue
              widths. Not surprisingly, different benchmarks have
              different bandwidth spectra (in some cases, even
              the instruction bandwidth spectrum is data
              dependent), and most of the time the bandwidth
              requirements increase linearly with issue rate
              (although the authors see it differently).}
}

@inproceedings{seznec+96,
  author   = {Andr{\'e} Seznec and St{\'e}phan Jourdan and Pascal
              Sainrat and Pierre Michaud},
  title    = {Multiple-Block Ahead Branch Predictors},
  crossref = {asplos96},
  pages    = {116--127},
  annote   = {With increasing ILP, processors will have to predict
              not only the next branch, but later branches, to
              avoid a branch processing bottleneck. This paper
              describes and evaluates a mechanism for predicting
              two branches ahead.}
}

@InProceedings{chen+96,
  author   = {I-Cheng K. Chen and John T. Coffey and Trevor N. Mudge},
  title    = {Analysis of Branch Prediction via Data Compression},
  pages    = {128--137},
  crossref = {asplos96},
  annote   = {Explains two-level branch predictors as special cases of
    a data compression scheme (prediction by partial matching) and
    compares them with a predictor derived from an optimal version of
    that compression scheme. The current schemes are already close to
    this optimal predictor.}
}

@InProceedings{lipasti+96,
  author   = {Mikko H. Lipasti and Christopher B. Wilkerson and John
    Paul Shen},
  title    = {Value Locality and Load Value Prediction},
  pages    = {138--147},
  crossref = {asplos96},
  annote   = {Starts with the surprising observation that more than
    half of the dynamic loads (on both PPC and Alpha) get the same
    value that they loaded the last time that static load was
    executed. They exploit this fact with a mechanism that reduces
    load latency for correctly predicted values to zero (with a
    one-cycle penalty on misprediction). They also propose a mechanism
    that completely avoids an access to the memory hierarchy for
    highly predictable loads, but this mechanism does not look very
    cost-effective to me. They show how their mechanisms could be
    integrated into the 21164 and into the PPC~620, and that they (in
    their simple forms) would provide a speedup of 6\% for the 21164
    and 3\% for the PPC~620.}
}

@InProceedings{romer+96,
  author   = {Theodore H. Romer and Dennis Lee and Geoffrey M. Voelker
    and Alec Wolman and Wayne A. Wong and Jean-Loup Baer and Brian
    N. Bershad and Henry M. Levy},
  title    = {The Structure and Performance of Interpreters},
  pages    = {150--159},
  crossref = {asplos96},
  annote   = {They analyse the behaviour of four interpreters: MIPSI,
    Java, Perl, and Tcl; None of these interpreters (except, perhaps,
    MIPSI) seems to be written for high performance (the slowdown over
    C for the DES benchmark starts at a factor of 60 for MIPSI and is
    at 4000 for Tcl). MIPSI has a low I-cache miss rate, the I-cache
    misses in Java are due to library calls, and Perl and Tcl have
    high I-cache miss rates due to their large virtual machines. The
    authors conclude that interpreters have no significant differences
    from other applications and that there is no need for
    architectural support for them. They should rather be improved at
    the software level.}
}

@InProceedings{luk&mowry96,
  author = 	 "Chi-Keung Luk and Todd C. Mowry",
  title = 	 "Compiler-Based Prefetching for Recursive Data Structures",
  crossref =	 "asplos96",
  pages =	 "222--233",
  annote =	 "Proposes and evaluates three approaches for
		  prefetching pointer-based data structures:
		  \emph{Greedy prefetching} prefetches from all
		  pointers in the node when the node is
		  visited. \emph{History-pointer prefetching} uses
		  an additional pointer in each node for each kind of
		  walkthrough, that points to the node that was
		  fetched $n$ nodes later on the last walk, and
		  prefetches from that pointer.
		  \emph{Data-linearization prefetching} puts the nodes
		  in an array and prefetches the node that is $n$
		  nodes further in the array; it works only if the
		  nodes are walked in the same order in which they
		  were built. They implemented greedy prefetching in a
		  compiler, and applied the other methods
		  manually. Greedy prefetching has little effect on
		  most benchmarks, but has a significant effect on the
		  three benchmarks that have a high fraction of load
		  stalls (up to 31\% reduction in
		  cycles). History-pointer prefetching was even more
		  effective for the benchmark for which it was used
		  (51\% reduction in cycles). Data-linearization
		  prefetching also outdoes greedy prefetching for the
		  two benchmarks where it was applied (19\%--22\%
		  fewer cycles than the original)."
}

@Proceedings{asplos96,
  key       = {ASPLOS-VII},
  title     = {Architectural Support for Programming Languages and
    Operating Systems (ASPLOS-VII)},
  booktitle = {Architectural Support for Programming Languages and
    Operating Systems (ASPLOS-VII)},
  year      = {1996}
}

@MastersThesis{pirker95,
  author =      {Christian Pirker},
  title =       {{{\"U}bersetzung von Forth in Maschinensprache}},
  school =      {{Technische  Universit\"{a}t Wien}},
  type =        {Diplomarbeit},
  year =        {1995},
  address =     {Austria},
  url =		{http://www.complang.tuwien.ac.at/Diplomarbeiten/pirker95.ps.gz},
  note =	{In German},
  abstract =    {Forth is an extensible and interactive language. It
                provides two programmer-visible stacks (data and
                returnstack). The supplied instructions (words)
                manipulate data on the stacks. The efficiency of the
                stack access and control flow determine mainly the
                performance of Forth implementations.\par This thesis
                builds a compiler that generates native code for
                {\em MIPS RISC processors}. The compiler translates
                Forth programs into native code using state of the
                art compiler technology. The code is directly
                executable on the processor.\par The compiler
                generates a {\bf data flow graph} for each basic
                block of the program. Then simple {\bf instruction
                selection}, {\bf instruction scheduling} and {\bf
                register allocation} algorithms produce the native
                code. The algorithms try to reduce the stack
                operations and eliminate unnecessary stack pointer
                updates.\par The native code compiler is written in
                Forth and can compile itself. The compiler is
                integrated into the interpreter. Therefore it also
                handles interpreter words.\par Forth programs
                compiled by this compiler run about 13 to 196 \%
                faster compared to interpreted programs. Currently
                compiling takes 220 \% longer than compiling into
                interpreting code.}
}

@InProceedings{gloy+95,
  author   = {Nicolas Gloy and Michael D. Smith and Cliff Young},
  title    = {Performance Issues in Correlated Branch Prediction Schemes},
  pages    = {3--14},
  crossref = {micro95},
  annote   = {Evaluates the effect of static correlated branch
    prediction (and its code expansion), code layout and branch
    alignment on I-cache misses, branch mispredictions and branch
    misfetches. The code expansion of static correlation is
    significant, especially the expansion of the portion that is
    actually executed. The I-cache miss rate is reduced significantly
    by code layout, and increases slightly with increasing history
    length in static correlation. Branch prediction accuracy increases
    a little from static correlation, but dynamic correlation (with
    gshare) is usually better. The combination of the optimizations is
    measured for three machine models, using a \emph{cycles saved per
    1000 instructions} metric. The best results are usually achieved
    with modest history lengths, sometimes even with 0 history (i.e.,
    no static correlation, only code layout and branch alignment). The
    combination of the optimizations outperforms gshare without such
    compiler support on 21164-like and PA-8000-like machine models.}
}

@InProceedings{nair95,
  author   = {Ravi Nair},
  title    = {Dynamic Path-Based Branch Prediction},
  pages    = {15--23},
  crossref = {micro95},
  annote   = {Compares dynamic path-based branch prediction with
    dynamic pattern-based branch prediction. At the same hardware
    cost, there is little difference for long flush intervals
    (flushing was used to simulate context switching effects). But
    path-based schemes have a shorter training time for the same
    prediction accuracy, and therefore perform a little better at high
    flush rates.}
}

@InProceedings{calder+95,
  author = 	 "Brad Calder and Dirk Grunwald and Amitabh Srivastava",
  title = 	 "The Predictability of Branches in Libraries",
  crossref =	 "micro95",
  pages =	 "24--34",
  annote =	 "Finds the following: The (branching) behaviour of
		  library code in one program can be
		  predicted well by profiling other programs that make
		  significant use of the library. Optimizing the
		  library with this information can improve the
		  performance of application programs significantly
		  (depending on the amount of time that the
		  application spends in the library), without
		  additional cost to the application programmer. Using
		  such optimized libraries with a profile-optimized
		  main program is almost as good as profile-optimizing
		  the program complete with the library. The heuristics
		  of \cite{ball&larus93} sometimes fail miserably (for
		  the Digital Unix libm in this case)."
}

@Proceedings{micro95,
  key       = {MICRO-28},
  title     = {International Symposium on Microarchitecture (MICRO-28)},
  booktitle = {International Symposium on Microarchitecture (MICRO-28)},
  year      = {1995}
}

@InProceedings{Adl-Tabatai96,
  author   = {Ali-Reza Adl-Tabatabai and Thomas Gross and Guei-Yuan
    Lueh},
  title    = {Code Reuse in an Optimizing Compiler},
  pages    = {51--68},
  crossref = {oopsla96},
  annote   = {Present some ways to reuse code in a compiler. Code
    examples in C++ are given. Not very spectacular, but, e.g., in
    register allocation this paper is the AFAIK first publication that
    gives a systematic view of the commonalities and differences of
    various register allocation methods. One nice feature of the
    compiler is that the same compiler binary can generate code for
    different architectures. The mapping of everything to classes
    which could just as well (or better) be expressed with
    conventional programming constructs (e.g., procedure variables)
    reinforces my impression that much of the OO stuff is just hype.}
}

@InProceedings{dean+96,
  author = 	 "Jeffrey Dean and Greg DeFouw and David Grove and
		  Vassily Litvinov and Craig Chambers",
  title = 	 "Vortex: An Optimizing Compiler for Object-Oriented
		  Languages",
  crossref =	 "oopsla96",
  pages =	 "83--100",
  annote =	 "Presents an intermediate code and optimizing
		  compiler back-end for object oriented languages,
		  combined with front-ends for several languages. The
		  intermediate representation represents
		  object-oriented concepts directly, not through
		  lower-level concepts. This allows better
		  optimization of object-oriented concepts. These
		  optimizations achieve a speedup of up to 10 for Cecil
		  programs, and 10\%--30\% for Java, C++, and Modula-3
		  programs. Also contains other interesting empirical
		  data about the benchmarks they used."
}

@InProceedings{moore96,
  author   = {Ivan Moore},
  title    = {Automatic Inheritance Hierarchy Restructuring and Method
    Refactoring},
  pages    = {235--250},
  crossref = {oopsla96},
  annote   = {Restructures the hierarchy by first removing it (putting
    all attributes in all offspring classes of an eliminated class),
    then creates a new hierarchy by making sets of attributes into
    classes. Methods are refactored by a kind of common subexpression
    elimination. Refactoring is done over all methods, then the new
    hierarchy is built. The work is based on Self, which makes some of
    these things particularly easy. Refactoring and restructuring are
    applied to three hierarchies; they discovered some new classes and
    reduced the code size (measured as number of message sends)
    somewhat, but basically did not change the structure very much.}
}

@InProceedings{diwan+96,
  author = 	 "Amer Diwan and J. Eliot B. Moss and Kathryn S. McKinley",
  title = 	 "Simple and Effective Analysis of Statically-Typed
		  Object-Oriented Programs",
  crossref =	 "oopsla96",
  pages =	 "292--305",
  annote =	 "Determines the effectiveness of four methods for
		  detecting whether a method invocation is monomorphic
		  (always calls the same procedure): \emph{Type hierarchy
		  analysis} uses only information present in the class
		  and method declaration. \emph{Intraprocedural type
		  propagation} is a data flow analysis that can determine
		  the type of the method recipient more accurately.
		  \emph{Aggregate analysis} determines whether a container
		  datatype contains just data of one
		  type.  \emph{Interprocedural type propagation} used
		  in this paper is a context-insensitive
		  interprocedural version of type propagation. The
		  language used in the paper is Modula-3, where NULL
		  is a subtype of all types and overrides every method
		  with the error method.  Consequently, type hierarchy
		  analysis can never determine that an invocation is
		  monomorphic. For the benchmarks used, only
		  intraprocedural and interprocedural type propagation
		  yielded significant improvements, detecting that
		  up to 35\% of the dynamic method invocations were
		  monomorphic.  To simulate the
		  semantics of other languages, the authors also
		  performed experiments that ignored the NULL
		  type. These experiments showed type hierarchy
		  analysis to be very effective, with aggregate
		  analysis and interprocedural type propagation
		  providing significant improvements in a few
		  cases. Up to 95\% of all calls were detected to be
		  monomorphic. In both variants, the whole program
		  was assumed to be available for the analysis.  The
		  paper then looks at the causes for not being able to
		  determine monomorphism (when ignoring NULL): In the
		  majority of cases, the invocation was actually
		  polymorphic. In the other cases, the reasons varied
		  with the benchmark.  The paper also discusses the
		  reasons for polymorphic invocations and
		  transformations that may resolve them. 50\% of all
		  dynamic method invocations are less than 60
		  instructions from each other (before analysis and
		  any optimization derived from it)."
}

@InProceedings{driesen&hoelzle96,
  author = 	 "Karel Driesen and Urs H{\"o}lzle",
  title = 	 "The Direct Cost of Virtual Function Calls in {C++}",
  crossref =	 "oopsla96",
  pages =	 "306--323",
  annote =	 "Studies the cost of virtual function calls on modern
		  processors, taking into account the effects of
		  out-of-order execution and caches. On their baseline
		  architecture, the standard implementation of virtual
		  function calls takes 1\%--10\% of the instructions
		  and 2\%--29\% of the cycles.  The thunk
		  implementation is slightly faster, for most
		  benchmarks, and much faster for a few. The relative
		  cost of virtual function calls will increase
		  slightly in the future. The influences of
		  architectural variations like branch misprediction
		  penalties, branch prediction accuracy, issue widths,
		  and load latency are also evaluated. The cost per
		  dispatch is 2--5
		  cycles for most benchmarks (10 cycles for one
		  benchmark) on their baseline architecture (4-wide
		  processor, 2 cycle load latency, 4 cycle branch
		  latency)."
}

@InProceedings{bacon&sweeney96,
  author = 	 "David F. Bacon and Peter F. Sweeney",
  title = 	 "Fast Static Analysis of {C++} Virtual Function Calls",
  crossref =	 "oopsla96",
  pages =	 "324--341",
  annote =	 "An empirical study of the effectiveness of
		  \emph{unique name} analysis, \emph{class
		  hierarchy analysis} and \emph{rapid type analysis}. Rapid
		  type analysis is an analysis that takes into account
		  which classes are actually instantiated. It is very
		  fast and detects a significant percentage of
		  monomorphic calls for some benchmarks (up to 100\%)."
}

@Proceedings{oopsla96,
  key       = {OOPSLA '96},
  title     = {Conference on Object-Oriented Programming Systems,
    Languages \& Applications (OOPSLA '96)},
  booktitle = {Conference on Object-Oriented Programming Systems,
    Languages \& Applications (OOPSLA '96)},
  year      = {1996}
}

@InProceedings{kessler96,
  author = 	 {Christoph W. Ke{\ss}ler},
  title = 	 {Scheduling Expression DAGs for Minimal Register Need},
  booktitle = 	 {Programming Languages: Implementations, Logics, and
                  Programs (PLILP'96)},
  series =	 {LNCS 1140},
  year =	 {1996},
  publisher =	 {Springer},
  pages =	 {228--242},
  annote =	 {A dynamic programming algorithm for basic-block
                  scheduling for register allocation.}
}

@Article{steenkiste&hennessy89,
  author  = {Peter A. Steenkiste and John L. Hennessy},
  title   = {A Simple Interprocedural Register Allocation Algorithm
    and Its Effectiveness for {LISP}},
  journal = toplas,
  volume  = {11},
  number  = {1},
  pages   = {1--32},
  month   = jan,
  year    = {1989}
}

@Misc{sharnoff&robenalt,
  author       = {David Muir Sharnoff and Steven Allen Robenalt},
  title        = {Catalog of compilers, interpreters, and other language
    tools},
  howpublished = {http://www.idiom.com/free-compilers}
}

@Article{ivanco&hunter90,
  author  = {Tyler A. Ivanco and Geoffry Hunter},
  title   = {A User Definable Language Interface for {Forth}},
  journal = jfar,
  volume  = {6},
  number  = {1},
  year    = {1990}
}

@Article{rodriguez&poehlman96,
  author  = {Bradford J. Rodriguez and W. F. S. Poehlman},
  title   = {A Survey of Object-Oriented {Forths}},
  journal = sigplan,
  pages   = {39--42},
  month   = apr,
  year    = {1996},
  url     = {http://www.zetetics.com/bj/papers/oofs.htm}
}

@Article{mckewan97,
  author  = {Andrew McKewan},
  title   = {Object-Oriented Programming in {ANS Forth}},
  journal = {Forth Dimensions},
  month   = mar,
  year    = {1997}
}

@InProceedings{gough97,
  author   = {K. John Gough},
  title    = {Multi-Language, Multi-Target Compiler Development:
    Evolution of the Gardens Point Compiler Project},
  pages    = {17--40},
  crossref = {jmlc97},
  annote   = {Describes a relatively recent compiler framework that
    uses a stack-based intermediate code (Dcode).}
}

@Proceedings{jmlc97,
  key       = {JMLC'97},
  title     = {Modular Programming Languages (JMLC '97)},
  booktitle = {Modular Programming Languages (JMLC '97)},
  series    = {LNCS},
  volume    = {1204},
  publisher = {Springer},
  year      = {1997}
}

@Article{moore87,
  author  = {Charles Moore},
  title   = {Forth -- eine pers{\"o}nliche Sprache},
  journal = {Vierte Dimension},
  volume  = {3},
  number  = {3},
  pages   = {11--13},
  month   = oct,
  year    = {1987},
  note    = {Translated into German by Klaus Schleisiek, the original is
    probably in \emph{More on NC4000}}
}

@Book{stroustroup94,
  author = 	 {Bjarne Stroustrup},
  title = 	 {The Design and Evolution of {C++}},
  publisher = 	 {Addison-Wesley},
  year = 	 {1994},
  annote =	 {An extended version of \cite{stroustroup93}.
                  Discusses the design philosophy of C++ and the
                  specific design issues in detail.}
}

@Article{oconnor&tremblay97,
  author = 	 {J. Michael O'Connor and Marc Tremblay},
  title = 	 {PicoJava-I: The {Java} Virtual Machine in Hardware},
  journal = 	 ieeemicro,
  year = 	 {1997},
  month =	 mar,
  pages =	 {45--53},
  annote =	 {The PicoJava has a 4-stage single-issue pipeline, a
                  64-entry stack cache, and can execute some load
                  instructions together with compute instructions.}
}

@Article{christie96,
  author = 	 {Dave Christie},
  title = 	 {Developing the {AMD-K5} Architecture},
  journal = 	 ieeemicro,
  year = 	 {1996},
  month =	 apr,
  pages =	 {16--26}
}

@Article{kessler&rauber95,
  author = 	 {Christoph W. Ke{\ss}ler and Thomas Rauber},
  title = 	 {Generating Optimal Contiguous Evaluations for
                  Expression {DAG}s},
  journal = 	 complang,
  year = 	 {1995},
  volume =	 {21},
  number =	 {2},
  pages =	 {113--127}
}

@InProceedings{aiken&nicolau88esop,
  author = 	 {Alexander Aiken and Alexandru Nicolau},
  title = 	 {Perfect Pipelining},
  booktitle = 	 {European Symposium on Programming (ESOP '88)},
  volume =	 {300},
  series =	 {LNCS},
  year =	 {1988},
  publisher =	 {Springer},
  url =		 {http://theory.stanford.edu/~aiken/publications/papers/esop88.ps},
  pages =	 {221--234}
}

@InProceedings{aiken&nicolau88pldi,
  author    = {Alexander Aiken and Alexandru Nicolau},
  title     = {Optimal Loop Parallelization},
  booktitle = {SIGPLAN '88 Conference on Programming Language Design and
    Implementation},
  pages     = {308--317},
  year      = {1988}
}

@Article{wirth88,
  author  = {Niklaus Wirth},
  title   = {From {Modula} to {Oberon}},
  journal = spe,
  volume  = {18},
  number  = {7},
  pages   = {661--670},
  month   = jul,
  year    = {1988}
}

@Book{gabriel96,
  author    = {Richard P. Gabriel},
  title     = {Patterns of Software},
  publisher = {Oxford University Press},
  year      = {1996},
  annote    = {A collection of essays on various topics, including
    Christopher Alexander, languages, an autobiography, and the story
    of Lucid, Inc.}
}

@Article{zsoter96,
  author  = {Andr{\'a}s Zs{\'o}ter},
  title   = {Does Late Binding Have to be Slow?},
  journal = {Forth Dimensions},
  volume  = {18},
  number  = {1},
  pages   = {31--35},
  year    = {1996},
  url     = {http://www.forth.org/oopf.html}
}

@Article{paysan94,
  author  = {Bernd Paysan},
  title   = {Object Oriented {bigFORTH}},
  journal = {Vierte Dimension},
  volume  = {10},
  number  = {2},
  year    = {1994},
  note    = {An implementation in ANS Forth is available at
    http://www.jwdt.com/~paysan/oof.zip}
}

@Book{pountain87,
  author = 	 {Dick Pountain},
  title = 	 {Object-Oriented {Forth}},
  publisher = 	 {Academic Press},
  address =	 {London},
  year = 	 {1987}
}

@Book{krishnamurthy95,
  title = 	 "Practical Reusable {UNIX} Software",
  publisher = 	 "John Wiley \& Sons",
  year = 	 1995,
  editor =	 "Balachander Krishnamurthy"
}

@Article{balachandran+90,
  author = 	 {A. Balachandran and D. M. Dhamdhere and S. Biswas},
  title = 	 {Efficient Retargetable Code Generation Using
                  Bottom-Up Tree Pattern Matching},
  journal = 	 complang,
  year = 	 {1990},
  volume =	 {15},
  number =	 {3},
  pages =	 {127--140}
}

@InProceedings{nair&hopkins97,
  author = 	 {Ravi Nair and Martin E. Hopkins},
  title = 	 {Exploiting Instruction Level Parallelism in
                  Processors by Caching Scheduled Groups},
  crossref =	 {isca97},
  pages =	 {13--25},
  annote =	 {Proposes a microarchitecture consisting of a simple,
                  slow engine and a parallel engine. The simple, slow
                  engine is used for the execution of traces not
                  contained in the DIF (Dynamic Instruction Formatting)
                  cache and for hard problems (exceptions, memory
                  disambiguation mispredictions); a VLIW-like
                  parallel engine is used for quickly executing code
                  in the DIF cache under the usual
                  circumstances. Evaluates this idea with simulations
                  and presents some interesting results; in particular,
                  already a small DIF cache (256--1024 entries) provides
                  good results.}
}

@InProceedings{ebcioglu&altman97,
  author = 	 {Kemal Ebcio{\u{g}}lu and Erik Altman},
  title = 	 {{DAISY}: Dynamic Compilation for 100\% Architectural
                  Compatibility},
  crossref =	 {isca97},
  pages =	 {26--37},
  annote =	 {Exploits significant amounts of instruction-level
                  parallelism by using a VLIW, and translating code
                  for existing architectures pagewise using a fast
                  scheduler. The VLIW has special hardware support for
                  this scheme.}
}

@InProceedings{hakura&gupta97,
  author   = {Ziyad S. Hakura and Anoop Gupta},
  title    = {The Design and Analysis of a Cache Architecture for
    Texture Mapping},
  pages    = {108--120},
  crossref = {isca97},
  annote   = {Uses a cache for the texture memory in MIP-mapped texture
    mapping and analyses the performance. A cache of 16K gives good
    performance, reducing the memory bandwidth required by 3--15
    times, especially when the texture mapping is performed in a tiled
    order.}
}

@InProceedings{wilson&olukotun97,
  author   = {Kenneth M. Wilson and Kunle Olukotun},
  title    = {Designing High Bandwidth On-Chip Caches},
  pages    = {121--132},
  crossref = {isca97},
  annote   = {Varies the cache size, organization, pipeline depth, and
    models the resulting cycle time, IPC, and overall execution time.}
}

@InProceedings{moshovos+97,
  author = 	 {Andreas Moshovos and Scott E. Breach and
                  T. N. Vijaykumar and Gurindar S. Sohi},
  title = 	 {Dynamic Speculation and Synchronization of Data Dependences},
  crossref =	 {isca97},
  pages =	 {181--193},
  annote =	 {Shows that blindly speculating on the independence
                  of memory accesses can hurt performance and proposes
                  a solution for the problem.}
}

@InProceedings{sodani&sohi97,
  author = 	 {Avinash Sodani and Gurindar S. Sohi},
  title = 	 {Dynamic Instruction Reuse},
  crossref =	 {isca97},
  pages =	 {194--205},
  annote =	 {Evaluates various ways to reuse the results of
                  instructions that were executed with the same
                  arguments. Such instructions arise from throwing
                  away instructions upon branch misprediction, whether
                  the instructions depended on the branch or not; such
                  instructions also arise from executing code several
                  times with (partially) the same arguments. Several
                  schemes for reuse are proposed, some depending on
                  the data flow, some on the actual values. A
                  significant percentage of the instructions is reused
                  with the most aggressive scheme, resulting in
                  speedups of 4\%--15\% (harmonic mean over all
                  benchmarks) for the various schemes.}
}

@InProceedings{palacharla+97,
  author = 	 {Subbarao Palacharla and Norman P. Jouppi and J. E. Smith},
  title = 	 {Complexity-Effective Superscalar Processors},
  crossref =	 {isca97},
  pages =	 {206--218},
  annote =	 {Estimates the cycle time of certain critical
                  (non-pipelinable) components of an OOO superscalar
                  processor at various feature sizes and for various
                  degrees of superscalarity. For an 8-issue superscalar
                  at 0.18$\mu$ the critical components are the bypass
                  logic and the wakeup and select logic. They then
                  propose a microarchitecture that avoids this
                  bottleneck: they partition the functional units into
                  two 4-issue clusters (with a one-cycle delay for
                  intercluster bypassing) to avoid the bypass bottleneck
                  and schedule the instructions into FIFOs of
                  (preferably) dependent instructions to avoid the wakeup and
                  select bottleneck. These changes have a small
                  negative effect on the IPC, but a large positive
                  effect on the (potential) cycle time, resulting in
                  an average improvement of 16\% in speed.}
}

@InProceedings{chang+97,
  author = 	 {Po-Yung Chang and Eric Hao and Yale N. Patt},
  title = 	 {Target Prediction for Indirect Jumps},
  crossref =	 {isca97},
  pages =	 {274--283},
  annote =	 {Apply the idea of two-level branch prediction to
                  predicting the targets of indirect jumps (i.e., it
                  adds (conditional) branch history to the target
                  address for accessing the target cache). For the
                  SPECint95 benchmarks involving many indirect jumps,
                  perl and gcc, this results in a prediction accuracy
                  of 93\% and 63\% and an execution time improvement
                  of 14\% and 5\%.}
}

@InProceedings{sprangle+97,
  author   = {Eric Sprangle and Robert S. Chappell and Mitch Alsup and
    Yale N. Patt},
  title    = {The Agree Predictor: A Mechanism for Reducing Negative
    Branch History Interference},
  pages    = {284--291},
  crossref = {isca97},
  annote   = {Reduces the number of conflict mispredictions by having
    the predictor entries predict whether or not some other predictor
    (say, a static predictor) is correct. This increases the chance
    that the predicted direction is correct in case of a conflict.}
}

@Proceedings{isca97,
  title = 	 "$24^{\textit{th}}$ Annual International Symposium on Computer Architecture",
  booktitle = 	 "$24^{\textit{th}}$ Annual International Symposium on Computer Architecture",
  year = 	 "1997",
  key =		 "ISCA '24",
}

@Article{fraser&henry91,
  author  = {Christopher W. Fraser and Robert R. Henry},
  title   = {Hard-Coding Bottom-Up Code Generation Tables to Save Time
    and Space},
  journal = spe,
  volume  = {21},
  number  = {1},
  pages   = {1--12},
  month   = jan,
  year    = {1991},
  annote  = {Describes how to optimize tree parsing automata. Some of
    these optimizations nowadays appear to trade too much time or
    complexity for space, but others save both time and space.}
}

@InProceedings{deutsch&schiffman84,
  author    = {L. Peter Deutsch and Allen M. Schiffman},
  title     = {Efficient Implementation of the {Smalltalk-80} System},
  booktitle = {Principles of Programming Languages (POPL'84)},
  pages     = {297--302},
  year      = {1984}
}

@InProceedings{proebsting97,
  author   = {Todd A. Proebsting},
  title    = {Simple Translation of Goal-Directed Evaluation},
  pages    = {1--6},
  crossref = {sigplan97},
  annote   = {Presents a method for translating expressions involving
    backtracking into simple goto-based code, without creating
    choicepoints. However, it does not discuss (in depth) how to
    translate backtracking procedures.}
}

@InProceedings{collberg97,
  author = 	 "Christian S. Collberg",
  title = 	 "Reverse Interpretation $+$ Mutation Analysis =
		  Automatic Retargeting",
  crossref =	 "sigplan97",
  pages =	 "57--70",
  annote =	 "Finds out how to generate code for an architecture
		  by automatic reverse engineering of the code
		  generator of a C compiler. In particular, the system uses
		  the C compiler to generate assembly language; it
		  feeds a large number of relatively simple C programs
		  to the C compiler. First it discovers the
		  assembler's syntax (this only works for conventional
		  syntaxes), then the meaning of the instructions
		  (using \emph{mutation analysis}, which analyses the
		  differences in the output (if any) for slightly
		  varying inputs), and generates a BEG code generator
		  from this. The system has been tested on the integer
		  instruction set of five register machines."
}

@inproceedings{ammons+97,
  author   = {Glenn Ammons and Thomas Ball and James R. Larus},
  title    = {Exploiting Hardware Performance Counters with Flow and
              Context Sensitive Profiling},
  crossref = {sigplan97},
  pages    = {85--96},
  annote   = {Introduces a clever way to do efficient profiling over
              (acyclic) paths. It also introduces context-sensitive
              profiling, which is based on the calling context tree, a
              slightly more accurate representation of the calling
              behaviour than the call graph; it allows to accurately
              attribute metrics to callers (in the absence of
              recursion). Also presents some results of measurements for
              various metrics (e.g., cache misses).},
}

@InProceedings{clinger&hansen97,
  author = 	 "William D. Clinger and Lars T. Hansen",
  title = 	 "Generational Garbage Collection and the Radioactive
		  Decay Model",
  crossref =	 "sigplan97",
  pages =	 "97--108",
  annote =	 "Shows that generational garbage collection can help
		  even if the objects' life expectancy is independent
		  of their age (i.e., the death rate is constant):
		  Just collect a part that has not been collected for
		  the longest time. The paper also presents some data
		  on the life-time of objects in real programs."
}

@inproceedings{poletto+97,
  author   = {Massimiliano Poletto and Dawson R. Engler and M. Frans
              Kaashoek},
  title    = {\textsf{tcc}: A System for Fast, Flexible, and High-Level
              Dynamic Code Generation},
  crossref = {sigplan97},
  pages    = {109--121},
  annote   = {Introduces `C, an extension of ANSI C for dynamic code
              generation, and tcc, a compiler for `C (current targets:
              SPARC and MIPS). Also presents some results.},
}

@inproceedings{goodwin97,
  author   = {David W. Goodwin},
  title    = {Interprocedural Dataflow Analysis in an Executable
              Optimizer},
  crossref = {sigplan97},
  pages    = {122--133},
  annote   = {Describes a fast interprocedural optimizer that works on
              executables. It first summarizes register information for
              procedures, then uses that information for various
              optimizations.},
}

@inproceedings{ayers+97,
  author   = {Andrew Ayers and Robert Gottlieb and Richard Schooler},
  title    = {Aggressive Inlining},
  crossref = {sigplan97},
  pages    = {134--145},
  annote   = {Discusses the inlining and cloning performed by HP's
              compiler. These components are controlled by giving them a
              budget (a compile-time-increase by 100\%), which they can
              use up in several passes, inlining only the most promising
              candidates. Cloning alone has little effect, inlining has
              a significant effect (factor 1.3 on SPECint95), and the
              combination is slightly better.},
}

@inproceedings{hashemi+97,
  author   = {Amir H. Hashemi and David R. Kaeli and Brad Calder},
  title    = {Efficient Procedure Mapping Using Cache Line Coloring},
  crossref = {sigplan97},
  pages    = {171--182},
  annote   = {Another method for reducing I-cache misses.},
}

@inproceedings{eichenberger&davidson97,
  author   = {Alexandre E. Eichenberger and Edward S. Davidson},
  title    = {Efficient Formulation for Optimal Modulo Schedulers},
  crossref = {sigplan97},
  pages    = {194--205},
}

@inproceedings{sperber&thiemann97,
  author   = {Michael Sperber and Peter Thiemann},
  title    = {Two for the Price of One: Composing Partial Evaluation and
              Compilation},
  crossref = {sigplan97},
  pages    = {215--225},
}

@inproceedings{bergner+97,
  author   = {Peter Bergner and Peter Dahl and David Engebretsen and
              Matthew O'Keefe},
  title    = {Spill Code Minimization via Interference Region Spilling},
  crossref = {sigplan97},
  pages    = {287--295},
  annote   = {Adds interference region spilling (similar to live range
              splitting) to Chaitin-style allocators. The results are
              encouraging.},
}

@inproceedings{lueh&gross97,
  author   = {Guei-Yuan Lueh and Thomas Gross},
  title    = {Call-Cost Directed Register Allocation},
  crossref = {sigplan97},
  pages    = {296--307},
  annote   = {This paper contains enough stuff for several papers: It
              introduces an improvement in register assignment that
              differentiates between caller-saved and callee-saved
              registers, and in some cases (where it is beneficial)
              rather spills a live range than assigning a register of
              the wrong class; the results are good. The paper also
              contains an empirical evaluation of various design options
              for graph-colouring register allocators. },
}

@InProceedings{ernst+97,
  author = 	 "Jens Ernst and William Evans and Christopher
		  W. Fraser and Steven Lucco and Todd A. Proebsting",
  title = 	 "Code Compression",
  crossref =	 "sigplan97",
  pages =	 "358--365",
  annote =	 "Proposes two forms of compressed code based on lcc's
		  intermediate language: the \emph{wire code} cannot
		  be interpreted directly and is somewhat (0\%--25\%)
		  smaller than gzipped SPARC code; one of the reasons
		  for this is that they separate instructions and
		  immediate data to help compression algorithms. They
		  also describe an interpretable code called BRISC
		  which uses \emph{operand specialization} and
		  \emph{opcode combination} to be compact. BRISC code
		  size is similar to gzipped Pentium code, its
		  interpreter is about 12 times slower than native
		  code, and code generated by a JIT is about 2.5 times
		  slower than native code."
}

@proceedings{sigplan97,
  booktitle = {SIGPLAN '97 Conference on Programming Language Design and
               Implementation},
  title     = {SIGPLAN '97 Conference on Programming Language Design and
               Implementation},
  year      = {1997},
  key       = {PLDI '97},
}

@book{woehr92,
  author    = {Jack Woehr},
  title     = {Forth: The New Model},
  publisher = {M\&T Publishing/Prentice-Hall},
  year      = {1992},
}

@book{kelly&spies86,
  author    = {Mahlon G. Kelly and Nicholas Spies},
  title     = {FORTH: A Text and a Reference},
  publisher = {Prentice-Hall},
  year      = {1986},
  note      = {Available from Miller Microcomputer Services, 61 Lake
               Shore Road, Natick, MA, USA},
}

@inproceedings{grove+97,
  author   = {David Grove and Greg DeFouw and Jeffrey Dean and Craig
              Chambers},
  title    = {Call Graph Construction in Object-Oriented Languages},
  crossref = {oopsla97},
  pages    = {108--124},
}

@inproceedings{zendra+97,
  author        = {Olivier Zendra and Dominique Colnet and Suzanne Collin},
  title         = {Efficient Dynamic Dispatch without Virtual Function
                   Tables. The {SmallEiffel} Compiler.},
  crossref      = {oopsla97},
  pages         = {125--141},
  url           = {http://smalleiffel.loria.fr/papers/oopsla97.ps.gz},
  html-part-url = {http://www.elj.com/elj-win32/ooplsa97-se-paper.html},
  annote        = {The SmallEiffel Compiler does not use virtual function
                   tables. Instead, it represents types with integers and
                   uses binary search (coded as an if-tree) to find the
                   right method for a selector. The compiler uses type
                   analysis to determine the possible receiver types at
                   each call site. The compiler generates a dispatch
                   function for each (used) set of receiver types for a
                   selector. The compiler recompiles the whole program
                   every time, but is fast enough (5000 lines/s
                   Eiffel-to-C on a Pentium Pro 200) to make this
                   practical; the C compiler then recompiles only the
                   changed parts. The paper presents some empirical
                   results that supposedly show that the binary search
                   outperforms the VFT approach for an unpredictable
                   trimorphic call, and still performs better on some
                   machines for a megamorphic (50 types) call. However,
                   the paper assumes that rotating the targets in a fixed
                   order makes them unpredictable, but that is not true
                   in the presence of history-pattern-based branch
                   predictors (e.g., PentiumPro), so don't put too much
                   faith in the results of the polymorphic call
                   benchmarks.},
}

@inproceedings{vitek+97,
  author   = {Jan Vitek and R. Nigel Horspool and Andreas Krall},
  title    = {Efficient Type Inclusion Tests},
  crossref = {oopsla97},
  pages    = {142--157},
}

@proceedings{oopsla97,
  title     = {Conference on Object-Oriented Programming Systems,
               Languages \& Applications (OOPSLA '97)},
  booktitle = {Conference on Object-Oriented Programming Systems,
               Languages \& Applications (OOPSLA '97)},
  year      = {1997},
  key       = {OOPSLA '97},
}

@book{adobe90,
  author    = {{Adobe Systems Incorporated}},
  title     = {PostScript Language --- Reference Manual},
  publisher = {Addison-Wesley},
  year      = {1990},
  edition   = {second},
}

@Book{adobe86blau,
  author = 	 {{Adobe Systems Incorporated}},
  title = 	 {PostScript Language --- Tutorial and Cookbook},
  publisher = 	 {Addison-Wesley},
  year = 	 1986
}

@book{adobe88gruen,
  author    = {{Adobe Systems Incorporated}},
  title     = {PostScript Language --- Program Design},
  publisher = {Addison-Wesley},
  year      = {1988},
}

@book{brodie84,
  author    = {Leo Brodie},
  title     = {Thinking Forth},
  publisher = {Fig Leaf Press (Forth Interest Group)},
  year      = {1984},
  address   = {100 Dolores St, Suite 183, Carmel, CA 93923, USA},
}

@InProceedings{brouwer+98,
  author = 	 "Klaus Brouwer and Wolfgang Gellerich and Erhard Ploedereder",
  title = 	 "Myths and Facts about the Efficient Implementation
		  of Finite Automata and Lexical Analysis",
  crossref =	 "cc98",
  pages =	 "1--15",
  annote =	 "Performed some measurements on Ada scanners
		  implemented with Aflex (the Ada version of flex),
		  REX, and several techniques implemented in
		  hand-written scanners. The measurements included
		  run-time on various machines, data and instruction
		  cache misses on a Pentium, the effects of using
		  optimization in the compiler. The most surprising
		  result (for me) was the huge difference between the
		  generated and the hand-written scanners (a factor of
		  74 between the fastest scanner and the
		  Aflex-generated one on a slow Sun). There was little
		  difference between the various hand-written
		  scanners. The paper also discusses the reasons for
		  the low performance of the generated scanners (mainly
		  because they are table-driven) and suggests some
		  improvements."
}

@inproceedings{johnstone&scott98,
  author   = {Adrian Johnstone and Elizabeth Scott},
  title    = {Generalized Recursive-Descent Parsing and
              Follow-Determinism},
  crossref = {cc98},
  pages    = {16--30},
  annote   = {Generalized Recursive-Descent parsing can handle all
              grammars that are not left-recursive (even ambiguous
              ones), but potentially requires exponential time; on LL(1)
              grammars it is linear. The technique is quite simple: each
              parsing function builds a set of the ends of possible
              matches. The paper also introduces a property
              \emph{follow-determinism} of grammars, which makes it
              possible to have linear-complexity recursive-descent
              parsers for more grammars than before (e.g., some grammars
              that cannot be left-factored, and some non-LR grammars). A
              generator GRDP implementing these techniques is
              available.},
}

@inproceedings{boyland98,
  author   = {John Tang Boyland},
  title    = {Analysing Direct Non-local Dependencies in Attribute
              Grammars},
  crossref = {cc98},
  pages    = {31--49},
  annote   = {Presents an extension to attribute grammars that allows to
              have rules dealing with parts of larger data structures
              (e.g., parts of symbol table entries), with correct
              execution order etc. The main part of the paper discusses
              how to analyse them at compiler-generation time.},
}

@inproceedings{knoop+98cc,
  author   = {Jens Knoop and Dirk Kosch{\"u}tzki and Bernhard Steffen},
  title    = {Basic-Block Graphs: Living Dinosaurs?},
  crossref = {cc98},
  pages    = {63--79},
  annote   = {Makes a case for single-instruction, edge-labeled
              data-flow graphs for data-flow analysis, mainly because
              they are simpler to analyse. Unfortunately the timings
              given in the paper are wrong (source: discussion with one
              of the authors).},
}

@inproceedings{martin+98,
  author   = {Florian Martin and Martin Alt and Reinhard Wilhelm and
              Christian Ferdinand},
  title    = {Analysis of Loops},
  crossref = {cc98},
  pages    = {80--94},
  annote   = {The context for this paper is data flow analysis of
              existing binary programs to find out the worst-case
              execution time (including cache effects). They improve
              their analysis by performing loop peeling during the
              analysis. This is somehow related to interprocedural
              analysis.},
}

@InProceedings{lapkowski&hendren98,
  author = 	 "Christopher Lapkowski and Laurie J. Hendren",
  title = 	 "Extended {SSA} Numbering: Introducing {SSA} Properties
		  to Languages with Multi-level Pointers",
  crossref =	 "cc98",
  pages =	 "128--143",
  annote =	 "Introduces a representation for data flow analysis
		  that allows some pointer analysis. It is based on
		  SSA numbering (similar to SSA form without $\phi$
		  nodes), but uses two numbers per pointer."
}

@InProceedings{kennedy+98,
  author = 	 "Robert Kennedy and Fred Chow and Peter Dahl and
		  Shin-Ming Liu and Raymond Lo and Mark Streich",
  title = 	 "Strength Reduction via {SSAPRE}",
  crossref =	 "cc98",
  pages =	 "144--158",
  annote =	 "Discusses how to do strength reduction in the SSAPRE
		  framework (partial redundancy elimination based on
		  SSA form). The advantages of this approach are that
		  it is more powerful than bit-vector based techniques
		  (it can handle situations in one pass that would
		  require a repetition of PRE and strength reduction
		  in the bit-vector approach). OTOH, it also has some
		  disadvantages through the absence of global
		  information: in particular, linear function test
		  replacement does not know all the replacement
		  candidates, so it may choose a suboptimal
		  replacement."
}

@InProceedings{ghiya+98,
  author = 	 "Rakesh Ghiya and Laurie J. Hendren and Yingchun Zhu",
  title = 	 "Detecting Parallelism in {C} Programs with Recursive
		  Data Structures",
  crossref =	 "cc98",
  pages =	 "159--173"
}

@inproceedings{cooper&simpson98,
  author   = {Keith D. Cooper and L. Taylor Simpson},
  title    = {Live Range Splitting in a Graph Coloring Register
              Allocator},
  crossref = {cc98},
  pages    = {174--187},
  annote   = {This splitting approach uses the containment graph, a
              directed form of the interference graph: there is a
              directed edge from a live range $b$ to a live range $a$ if
              $b$ is not live at any definition or use of $a$ ($a$
              contains $b$), but $a$ and $b$ conflict; in that case $a$
              can be \emph{split around} $b$, and both can occupy the
              same register. The splitting method presented in the paper
              is passive, splitting only if the node would be spilled,
              and if splitting appears to be cheaper than spilling. The
              paper presents some empirical results, that show an
              improvement over the plain spilling approach. I find them
              not very convincing, due to the fact that they only use
              splitting where it appears better than spilling; in this
              setting only a technique that is dominated by the old
              technique (i.e., that is never better) would not show an
              improvement.},
}

@inproceedings{gupta98,
  author   = {Rajiv Gupta},
  title    = {A Code Motion Framework for Global Instruction Scheduling},
  crossref = {cc98},
  pages    = {219--233},
  annote   = {Presents a global instruction scheduling method based on
              moving single instructions into delay slots, somewhat like
              percolation scheduling and the approach of
              \cite{bernstein&rodeh91}. No empirical evaluation is
              presented.},
}

@inproceedings{stuempel+98,
  author   = {Esther St{\"u}mpel and Michael Thies and Uwe Kastens},
  title    = {{VLIW} Compilation Techniques for Superscalar
              Architectures},
  crossref = {cc98},
  pages    = {234--248},
  annote   = {Models the PPC~604 as VLIW processor (which presented some
              interesting problems) and used a compiler that uses
              various scheduling techniques on it. Presents empirical
              results.},
}

@InProceedings{engelbrecht&kourie98,
  author = 	 "R. L. Engelbrecht and D. G. Kourie",
  title = 	 "Issues in Translating {Smalltalk} to {Java}",
  crossref =	 "cc98",
  pages =	 "249--263",
  annote =	 "Presents some of the challenges in translating
		  Smalltalk to Java, and how to deal with them; in
		  particular: in Smalltalk every selector can be sent
		  to any object; Smalltalk's variables can have any
		  type; Smalltalk's class methods have no equivalent
		  in Java."
}

@InProceedings{steindl98,
  author = 	 "Christoph Steindl",
  title = 	 "Intermodular Slicing of Object-Oriented Programs",
  crossref =	 "cc98",
  pages =	 "264--278",
  annote =	 "Presents a slicer for Oberon programs. It uses a
		  combination of a conservative approach and user
		  feedback to deal with the problem of function
		  pointers that is present at every method
		  invocation. The slicer has been implemented and is
		  practical, typically producing a slice in one
		  second."
}

@InProceedings{petterson98,
  author = 	 "Mikael Pettersson",
  title = 	 "Portable Debugging and Profiling",
  crossref =	 "cc98",
  pages =	 "279--293",
  annote =	 "This paper deals with the problem of producing a
		  portable debugger for a language implementation that
		  achieves portability by translating into C. The
		  approach taken in the paper is to add debugging code
		  to the C program, that's quite similar to what a C
		  compiler with debugging turned on inserts into the
		  machine program; the effects are also similar: 22\%
		  slowdown and 300\% code growth (from the talk, the
		  numbers in the paper are different); this does not
		  give information on variable values to the
		  debugger. This approach is also applied to profiling."
}

@InProceedings{leino&nelson98,
  author = 	 "K. Rustan M. Leino and Greg Nelson",
  title = 	 "An Extended Static Checker for {Modula-3}",
  crossref =	 "cc98",
  pages =	 "302--305",
  annote =	 "The checker works by taking a program (usually
		  extended with some annotations), generating logical
		  formulas from that, and leaving it to a theorem
		  prover to check them. It does not prove the
		  correctness of the program, but can find some
		  errors."
}

@proceedings{cc98,
  title     = {Compiler Construction (CC'98)},
  booktitle = {Compiler Construction (CC'98)},
  year      = {1998},
  key       = {CC'98},
  editor    = {Kai Koskimies},
  OPTvolume = {1383},
  OPTseries = {LNCS},
  publisher = {Springer LNCS~1383},
  address   = {Lisbon},
}

@inproceedings{smolka98,
  author   = {Gert Smolka},
  title    = {Concurrent Constraint Programming Based on Functional
              Programming},
  crossref = {esop98},
  pages    = {1--11},
  annote   = {Adds promises and futures to ML, which allows doing things
              like logic variables, coroutining, and possibly attributed
              variables (meta-structures) in Prolog.},
}

@InProceedings{chin+98,
  author = 	 "Wei-Ngan Chin and Siau-Cheng Khoo and Tat-Wee Lee",
  title = 	 "Synchronisation Analysis to Stop Tupling",
  crossref =	 "esop98",
  pages =	 "75--89",
  annote =	 "Tupling groups calls with common arguments together,
		  so their multiple results can be computed
		  simultaneously (in some sense it is a functional
		  equivalent of loop fusion); it's a powerful
		  transformation, allowing reductions in
		  complexity. The problem is, as usual, ensuring that
		  the transformation stops. The paper introduces
		  \emph{synchronization analysis} for indicating when
		  tupling can be safely applied."
}

@inproceedings{jay&steckler98,
  author   = {C. B. Jay and P. A. Steckler},
  title    = {The Functional Imperative: Shape!},
  crossref = {esop98},
  pages    = {139--153},
  annote   = {They compile \textsf{FiSh}, a functional language for
              array computation into very efficient code by using
              \emph{shape analysis}, a kind of type analysis that
              includes array sizes. Another, probably equally important
              reason for the efficiency is that the compiler performs
              full inlining of everything (in particular, higher-order
              functions). They present empirical results showing a huge
              speedup over the Glasgow Haskell compiler, and speedups of
              1.25--7 over ocamlopt.},
}

@inproceedings{knoop+98esop,
  author   = {Jens Knoop and Oliver R{\"u}thing and Bernhard Steffen},
  title    = {Code Motion and Code Placement: Just Synonyms?},
  crossref = {esop98},
  pages    = {154--169},
  annote   = {Discusses some variations in partial redundancy
              elimination algorithms. Maybe a good starting point for
              this topic.},
}

@inproceedings{ross&sagiv98,
  author   = {John L. Ross and Mooly Sagiv},
  title    = {Building a Bridge between Pointer Aliases and Program
              Dependences},
  crossref = {esop98},
  pages    = {221--235},
  annote   = {Presents a way to map the may-alias problem (are two
              references possible aliases at one point?) to the program
              dependence problem (does a program point possibly depend
              on another?).},
}

@inproceedings{scherlis98,
  author   = {William L. Scherlis},
  title    = {Systematic Change of Data Representation: Program
              Manipulations and a Case Study},
  crossref = {esop98},
  pages    = {252--266},
  annote   = {Presents three class manipulation transformations:
              \emph{Shift} moves a common computation from all wrapping
              (constructor) sites to all unwrapping sites or vice versa,
              possibly accomplishing representation changes.
              \emph{Project} and \emph{idempotency} do the obvious
              thing. The paper illustrates these techniques by deriving
              the Java \texttt{String} and \texttt{StringBuffer} classes
              from a C-like string class through program
              transformations.},
}

@proceedings{esop98,
  title     = {Programming Languages and Systems (ESOP'98)},
  booktitle = {Programming Languages and Systems (ESOP'98)},
  year      = {1998},
  key       = {ESOP'98},
  OPTvolume = {1381},
  OPTseries = {LNCS},
  publisher = {Springer LNCS~1381},
  address   = {Lisbon},
}

@article{cooper+98,
  author  = {Keith D. Cooper and Timothy J. Harvey and Linda Torczon},
  title   = {How to Build an Interference Graph},
  journal = spe,
  year    = {1998},
  volume  = {28},
  number  = {4},
  pages   = {425--444},
  url     = {http://softlib.cs.rice.edu/MSCP/papers/hash.ps.gz},
  annote  = {Empirically compares the bit-matrix representation of
             interference graphs to a hash-table representation. Also
             presents some improvements to the classical bit-matrix
             approach: they use separate graphs for integer and FP
             registers; and they don't use an extra pass to determine
             the sizes of the edge lists, but use an extensible
             edge-list representation. The empirical results are for
             graphs for FORTRAN functions with up to 5936 nodes and up
             to 723605 edges; the split bit-matrix is the best method
             both speedwise and in size for most graphs; the best
             hashing method beats the split bit-matrix sizewise in one
             case and speedwise in three cases (out of 169). The authors
             admit that for even larger graphs the hash table would be
             more beneficial.},
}

@InProceedings{appelbe+98,
  author = 	 "Bill Appelbe and Raja Das and Reid Harmon",
  title = 	 "Future Branches -- Beyond Speculative Execution",
  crossref =	 "acac98",
  pages =	 "1--13",
  annote =	 "Future branch instructions are a modern variant
		  of branch delay slots: The branch contains the
		  address where it will take effect, allowing the
		  branch to move up a considerable distance and
		  reducing branch prediction miss penalties. To deal
		  with misspeculation, there is also an \emph{undo
		  future branch} instruction. The
		  implementation they propose uses a fully associative
		  \emph{pending branch table}, which is managed as a
		  queue, and whose minimum size is programmer-visible
		  (the compiler must avoid too many pending branches)."
}

@inproceedings{biglari-abhari+98,
  author   = {Morteza Biglari-Abhari and Michael J. Liebelt and Kamran
              Eshraghian},
  title    = {Implementing a {VLIW} Compiler: Motivation and Trade-offs},
  crossref = {acac98},
  pages    = {37--46},
}

@inproceedings{channon&koch98,
  author   = {David Channon and David Koch},
  title    = {A Study of Sparse 2-Dimensional Translation Lookaside
              Buffers},
  crossref = {acac98},
  pages    = {47--56},
  annote   = {Compares various TLB implementations for very large
              address spaces.},
}

@inproceedings{littin+98,
  author   = {Richard H. Littin and J. A. David McWha and Murray W.
              Pearson and John G. Cleary},
  title    = {Block Based Execution and Task Level Parallelism},
  crossref = {acac98},
  pages    = {57--66},
  annote   = {Proposes and evaluates a machine with an instruction set
              consisting of fixed-length basic blocks, that
              differentiates between intra-basic-block data flow and
              inter-basic-block data flow, similar to the multiscalar
              approach \cite{sohi+95}.},
}

@inproceedings{omondi98,
  author   = {Amos R. Omondi},
  title    = {Fast Floating-Point Addition Without Operand Conversion},
  crossref = {acac98},
  pages    = {145--155},
  annote   = {Presents an FP adder that does addition of the mantissae
              in sign-magnitude representation instead of converting it
              to two's complement representation.},
}

@InProceedings{siemers&moeller98,
  author = 	 "Christian Siemers and Dietmar P. F. M{\"o}ller",
  title = 	 "The {$>$S$<$puter}: A Novel Microarchitecture Model
		  for Execution inside Superscalar and {VLIW} Processors
		  Using Reconfigurable Hardware",
  crossref =	 "acac98",
  pages =	 "169--178",
  annote =	 "The microarchitecture presented here contains a
		  number of functional units with reconfigurable
		  interconnects. The decoder deals with a
		  basic block at a time and configures the FU network for
		  the instructions in the basic block."
}

@inproceedings{szyperski+98,
  author   = {Clemens Szyperski and Paul Roe and Siu Yuen Chan and Geoff
              Elgey},
  title    = {Garden's Autobahn: Efficient and Safe Streaming of Data
              Structures for High Performance Communication
              Architectures},
  crossref = {acac98},
  pages    = {193--203},
  annote   = {Presents a combination of language feature, compiler
              technique and hardware for transmitting irregular data
              structures over a network, with many desirable properties,
              such as efficiency.},
}

@InProceedings{zhu&wong98,
  author = 	 "Y. Zhu and W. F. Wong",
  title = 	 "The Effect of Instruction Dependency on Superscalar
		  Processor Performance",
  crossref =	 "acac98",
  pages =	 "215--226",
  annote =	 "Presents a theoretical model for predicting
		  performance for a class of superscalar machines;
		  this allows running a benchmark once (to determine
		  the benchmark characteristics important for the
		  model), and predicting the performance of several
		  microarchitectures. They also present some empirical
		  data on the accuracy of their results (-19\%--14\%
		  error for the IPC predictions)."
}

@proceedings{acac98,
  title     = {Computer Architecture 98 (ACAC '98)},
  booktitle = {Computer Architecture 98 (ACAC '98)},
  year      = {1998},
  key       = {ACAC '98},
  editor    = {John Morris},
  volume    = {20},
  number    = {4},
  series    = {Australian Computer Science Communications},
  publisher = {Springer},
  address   = {Perth},
}

@Article{bhamidipaty&proebsting98,
  author = 	 "Achyutram Bhamidipaty and Todd A. Proebsting",
  title = 	 "Very Fast {YACC}-Compatible Parsers (For Very Little Effort)",
  journal =	 spe,
  year =	 "1998",
  volume =	 "28",
  number =	 "2",
  pages =	 "181--190",
  annote =	 "Presents a simple approach to generate hard-coded
		  parsers, once you have an LALR automaton. They give
		  special care to being completely yacc-compatible,
		  including error handling. The resulting parsers are
		  2--6 times faster than yacc-generated parsers
		  (bison-generated parsers are slightly slower still),
		  but consume more memory (up to five times as large
		  for gcc's parser, that is 75KB)."
}

@Article{boehm&weiser88,
  author = 	 "Hans-Juergen Boehm and Mark Weiser",
  title = 	 "Garbage Collection in an Uncooperative Environment",
  journal =	 spe,
  year =	 "1988",
  volume =	 "18",
  number =	 "9",
  pages =	 "807--820",
  annote =	 "One of the first papers on conservative garbage
		  collection. Their freelist implementation allows
		  them to find out quickly whether a piece of memory
		  is managed by their garbage collector, and whether a
		  value is a valid pointer to the start of an
		  object. They report experiences in the context of
		  their Russell implementation, for replacing the
		  normal allocator in two large C programs, and for using
		  the technique in debugging (to find leaks and
		  premature frees)."
}

@article{wentworth90,
  author  = {E. P. Wentworth},
  title   = {Pitfalls of Conservative Garbage Collection},
  journal = spe,
  year    = {1990},
  volume  = {20},
  number  = {7},
  pages   = {719--727},
  annote  = {Does some measurements on space leakage of conservative
             garbage collectors due to pointer misidentification, in a
             16-bit address space. LISP programs exhibit a pretty
             constant leakage overhead, which is acceptable. OTOH, some
             KRC programs exhibit unbounded leakage, due to the use of
             lazy lists: if some list element is referenced by a
             spurious pointer, the whole list (growing into infinity)
             starting at that element will be retained.},
}

@Article{zorn93,
  author = 	 "Benjamin Zorn",
  title = 	 "The Measured Cost of Conservative Garbage Collection",
  journal =	 spe,
  year =	 "1993",
  volume =	 "23",
  number =	 "7",
  pages =	 "733--756",
  techreport-url = "ftp://ftp.cs.colorado.edu/pub/cs/techreports/zorn/CU-CS-573-92.ps.Z",
  annote =	 "Presents empirical data on the performance of four
		  malloc/free implementations and the Boehm-Weiser
		  garbage collector on six C programs. The
		  Boehm-Weiser collector was competitive speedwise,
		  but required up to 2.5 times as much
		  memory as the most space-efficient malloc/free
		  library. Consequently, it does not perform as well
		  as the others for certain physical memory
		  sizes. Four of the programs contain additional
		  memory managers, but their use generally does not
		  help space or time-wise; in several cases, there is
		  even a slowdown; choosing a good general-purpose
		  allocator seems more worthwhile. The Boehm-Weiser
		  collector had a bad effect on cache performance,
		  probably due to its freelist design and the effects
		  of the marking pass."
}

@Article{nevill-manning+98,
  author = 	 "Craig G. Nevill-Manning and Todd Reed and Ian H. Witten",
  title = 	 "Extracting Text from {PostScript}",
  journal =	 spe,
  year =	 "1998",
  volume =	 "28",
  number =	 "5",
  pages =	 "481--491",
  annote =	 "Uses a distiller-like technique to extract plain
		  text from postscript. Word breaks and character
		  breaks are recognized by the spacing. No technique
		  for extracting more structural information is
		  presented."
}

@Article{allen+98,
  author  = {Vicki Allan and Steven J. Beaty and Bogong Su and
             Philip H. Sweany},
  title   = {Building a Retargetable Local Instruction Scheduler},
  journal = spe,
  year    = {1998},
  volume  = {28},
  number  = {3},
  pages   = {249--283},
  annote  = {Discusses some topics in instruction scheduling.
             They present a complex timing model, explore some
             variations of scheduling algorithms:
             instruction-driven vs. operation-driven list
             scheduling, foresight and lookahead to avoid
             scheduling failures (possible with their timing
             model) and different scheduling directions. They
             discuss the integration of scheduling and register
             allocation. In their experimental evaluation,
             operation-driven scheduling looks better than
             instruction-driven scheduling, and the decision
             between forward and reverse scheduling depends on
             the architecture (they suggest taking the better of
             both). They also evaluate heuristics: optimizing a
             linear polynomial of 24 heuristics with a genetic
             algorithm beats the best of 2500 randomly generated
             polynomials by 5\%; however, I wonder if this speedup
             resulted from optimizing for their test
             cases, and if it would also show up for other test
             cases. Some questionable results (e.g., that
             straight list scheduling produced scheduling
             failures for the Alpha architecture) throw doubt on
             the whole paper.}
}

@Article{norris&pollock98,
  author  = {Cindy Norris and Lori L. Pollock},
  title   = {The Design and Implementation of {RAP}: A {PDG}-based
             Register Allocator},
  journal = spe,
  year    = {1998},
  volume  = {28},
  number  = {4},
  pages   = {401--424},
  annote  = {Presents a hierarchical register allocation
             algorithm that differs from
             \cite{callahan&koblenz91} in the following: It uses
             the PDG's regions instead of single-entry
             single-exit tiles; and it does everything on a
             bottom-up pass instead of delaying register
             assignment until a top-down pass. It does not
             exploit the PDG's partial ordering of
             instructions. The empirical results (for small
             benchmarks on register-starved machines) show a
             slow-down of 1\%--4\% in the generated code compared
             to a comparable non-hierarchical register
             allocator. The main reason seems to be the way spill
             code is inserted.}
}

@Article{shaw88,
  author  = {George W. Shaw},
  title   = {Forth Shifts Gears},
  journal = {Computer Language},
  pages   = {67--75 (May), 61--65 (June)},
  year    = {1988},
  annote  = {Discusses multiple code fields, their application for
             implementing \texttt{value} and \texttt{defer}, and how
             they are implemented. Also discusses state-smart words
             and how to use MCFs to avoid them.}
}

@Book{papadimitriou&steiglitz82,
  author    = {Christos H. Papadimitriou and Kenneth Steiglitz},
  title     = {Combinatorial Optimization: Algorithms and Complexity},
  publisher = {Prentice-Hall},
  address   = {Englewood Cliffs, NJ},
  year      = {1982}
}

@Misc{paysan98,
  author       = {Bernd Paysan},
  title        = {{Re: State-smart etc   Was: Re: Facelifting my Forth}},
  howpublished = {Usenet newsgroup comp.lang.forth, message 351B70D4.F31@\linebreak[0]remove.muenchen.this.org.junk},
  month        = mar,
  year         = {1998},
  annote       = {Describes how to implement \texttt{interpret/compile:}.}
}

@Misc{bradley96,
  author       = {Mitch Bradley},
  title        = {{Re: Another solution for RFIs 8 and 9}},
  howpublished = {Message 9609231729.AA06128@FirmWorks.COM to the mailing list ansforth@minerva.com},
  month        = sep,
  year         = {1996}
}

@Misc{gforth,
  key          = {Gforth},
  title        = {Gforth home page},
  howpublished = {http://www.complang.\linebreak[0]tuwien.ac.at/forth/gforth/},
  url          = {http://www.complang.tuwien.ac.at/forth/gforth/}
}

@Misc{proebsting98,
  author       = {Todd Proebsting},
  title        = {Least-Cost Instruction Selection in {DAG}s is
                  {NP}-Complete},
  year         = {1998},
  howpublished = {http://research.microsoft.com/\~{}toddpro/papers/\linebreak[0]proof.htm},
  url          = {http://research.microsoft.com/~toddpro/papers/proof.htm}
}

@InProceedings{eichenberger+1995,
  author    = {Alexandre E. Eichenberger and Edward S. Davidson and
               Santosh G. Abraham},
  title     = {Optimum Modulo Schedules for Minimum Register
               Requirements},
  booktitle = {Proceedings of the 1995 International Conference on
               Supercomputing},
  pages     = {31--40},
  month     = jul,
  year      = {1995},
  url       = {http://www.eecs.umich.edu/PPP/ICS95.ps},
  keywords  = {Software pipelining, Register sensitive modulo
               scheduling, instruction level parallelism, VLIW,
               Superscalar}
}

@Article{moon&ebcioglu97,
  author  = {S. Moon and K. Ebcioglu},
  title   = {Parallelizing Nonnumerical Code with Selective Scheduling
             and Software Pipelining},
  journal = {ACM Transactions on Programming Languages and Systems},
  volume  = 19,
  number  = 6,
  pages   = {853--898},
  month   = nov,
  year    = 1997
}

@InProceedings{bodik+98,
  author   = {Rastislav Bod{\'\i}k and Rajiv Gupta and Mary Lou
              Soffa},
  title    = {Complete Removal of Redundant Expressions},
  crossref = {sigplan98},
  pages    = {1--14},
  annote   = {Partial redundancy elimination sometimes is hindered
              by control flow joins. This paper explores code
              replication to eliminate this problem, but tries to
              limit code growth by replicating only those blocks
              necessary, or by profile-guided \emph{selective
              restructuring} or speculation. These improvements
              allowed the removal of up to 2\% of the dynamically
              executed instructions (but they indicate that this
              disappointing result may be an artifact of the basic
              compiler used).}
}

@InProceedings{lo+98,
  author   = {Raymond Lo and Fred Chow and Robert Kennedy and
              Shin-Ming Liu and Peng Tu},
  title    = {Register Promotion by Sparse Partial Redundancy
              Elimination of Loads and Stores},
  crossref = {sigplan98},
  pages    = {26--37},
  annote   = {Register promotion is performed by performing
              partial redundancy elimination of the loads and
              stores of the values to be promoted. For loads, this
              is the same as normal partial redundancy elimination
              and can be performed in the SSAPRE framework. For
              stores, the paper introduces a dual of SSAPRE,
              SSUPRE (U=use). The improvement through PRE of
              stores is small, however (only 1.2\% of all stores
              and 4.2\% of the redundant stores are eliminated);
              apparently dead and faint store elimination
              eliminates most redundancy before that. PRE of loads
              works well, however, eliminating 25.6\% of all
              loads. Various forms of speculation have only a
              small (and not always positive) effect on the number
              of loads and stores executed.}
}

@InProceedings{ammons&larus98,
  author   = {Glenn Ammons and James R. Larus},
  title    = {Improving Data-Flow Analysis with Path Profiles},
  crossref = {sigplan98},
  pages    = {72--84},
  annote   = {Improves data-flow analysis by extending the
              control-flow graph into a hot-path graph, performing
              data-flow analysis on this graph, then eliminating
              unnecessary duplicates. The paper evaluates the
              technique by applying it to constant propagation,
              resulting in speedups of -4.4\%--9.8\%.}
}

@InProceedings{diwan+98,
  author   = {Amer Diwan and Kathryn S. McKinley and
              J. Eliot B. Moss},
  title    = {Type-Based Alias Analysis},
  pages    = {106--117},
  crossref = {sigplan98},
  annote   = {An empirical evaluation of three type-based alias analyses
              for Modula-3. The simplest analysis is quite imprecise
              using the established metric (average number of
              intraprocedural aliases of a reference), but the more
              sophisticated analyses (based on taking field accesses
              into account; and based on a flow-insensitive analysis of
              variable assignments and references) are much better. In
              the bottom line, for (fully) redundant load elimination,
              they all perform about equally well, resulting in speedups
              of 1\%--8\%. They also compare their analysis to an upper
              bound and conclude that at most 2.5\% of the remaining
              loads could be eliminated by more precise analysis.}
}

@InProceedings{sastry+98,
  author   = {S. Subramanya Sastry and Subbarao Palacharla and
              James E. Smith},
  title    = {Exploiting Idle Floating-Point Resources for Integer
              Execution},
  crossref = {sigplan98},
  pages    = {118--129},
  annote   = {They propose adding integer operations to
              floating-point units (like MMX was added). They
              evaluate a compilation technique that simply puts
              all integer operations in the FP unit that do not
              have to be in the integer unit because they are
              needed to compute addresses, or due to calling
              conventions; i.e., store-value slices and branch
              slices are moved into the FP unit. An advanced
              scheme inserts inter-partition copies and duplicates
              some instructions to increase the possibility for
              offloading. These methods offload 5.7\%--29.5\%
              (basic scheme) and 8.3\%--41.6\% (advanced scheme)
              of the integer instructions for SPECint95 into the
              FP unit, resulting in speedups of 1.6\%--20.8\%
              (2.5\%--23.1\%) on a $2+2$-issue
              machine and 0.8\%--17.1\% (1.0\%--18.0\%) on a
              $4+4$-issue machine. Interestingly, this aggressive
              approach had little effect on the SPECfp benchmarks,
              except for ear, where 18\% of the integer
              instructions could be offloaded, resulting in 18\%
              speedup on a $2+2$-issue machine.}
}

@InProceedings{traub+98,
  author   = {Omri Traub and Glenn Holloway and Michael D. Smith},
  title    = {Quality and Speed in Linear-Scan Register Allocation},
  pages    = {142--151},
  crossref = {sigplan98},
  annote   = {Presents a register allocation method called binpacking,
              where the program is scanned from start to end, and
              registers are allocated when needed (with the usual
              problems at control-flow joins). The paper presents an
              improvement called second-chance binpacking, which is
              similar to live range splitting for graph coloring
              allocators. An empirical comparison of second-chance
              bin-packing with a graph coloring register allocator shows
              an increase of 0\%--8.6\% in executed instructions and
              -3.4\%--8.2\% slowdown in run-time for some SPEC
              benchmarks. Binpacking is advertized as fast, but the
              register allocation times presented are not impressive
              (slower than graph colouring for small functions).}
}

@InProceedings{cheng+98,
  author   = {Perry Cheng and Robert Harper and Peter Lee},
  title    = {Generational Stack Collection and Profile-Driven
              Pretenuring},
  crossref = {sigplan98},
  pages    = {162--173},
  annote   = {Generational stack collection reduces the time
              needed for scanning the stack for roots by applying
              generational techniques; this is based on the fact
              that the deeper regions of the stack often do not
              change between two collections. Profile-driven
              pretenuring identifies allocation sites that usually
              produce long-lived data, and tenures all data
              allocated by these sites immediately, thus
              eliminating a lot of copying.}
}

@InProceedings{clinger98,
  author   = {William D. Clinger},
  title    = {Proper Tail Recursion and Space Efficiency},
  pages    = {174--185},
  crossref = {sigplan98},
  annote   = {A theoretical specification of
              \emph{proper tail recursion}.}
}

@InProceedings{wickline+98,
  author   = {Philip Wickline and Peter Lee and Frank Pfenning},
  title    = {Run-time Code Generation and {Modal-ML}},
  crossref = {sigplan98},
  pages    = {224--235},
  annote   = {Presents a variant of ML with special constructs for
              run-time code generation (staging), and how it is
              translated into an abstract machine called
              CCAM.  Hard to read.}
}

@InProceedings{xi&pfenning98,
  author   = {Hongwei Xi and Frank Pfenning},
  title    = {Eliminating Array Bound Checking Through Dependent Types},
  pages    = {249--257},
  crossref = {sigplan98},
  annote   = {Extends ML with yet another static type constructor that,
              when used well by the programmer, allows the compiler to
              eliminate many array bound checks.}
}

@InProceedings{bacon+98,
  author   = {David F. Bacon and Ravi Konuru and Chet Murthy and
              Mauricio Serrano},
  title    = {Thin Locks: Featherweight Synchronization for {Java}},
  crossref = {sigplan98},
  pages    = {258--268},
  annote   = {They optimize locking in Java by having thin locks
              in the objects that cover the frequent cases, with
              an escape to fat locks that are in separate tables.}
}

@InProceedings{adl-tabatabai+98,
  author   = {Ali-Reza Adl-Tabatabai and Micha{\l} Cierniak and
              Guei-Yuan Lueh and Vishesh M. Parikh and James
              M. Stichnoth},
  title    = {Fast, Effective Code Generation in a Just-In-Time
              {Java} Compiler},
  crossref = {sigplan98},
  pages    = {280--290},
  annote   = {Presents some techniques that are used in Intel's
              JIT for IA32. \emph{Lazy code selection} uses a
              simulated stack that contains objects representing
              various addressing modes; essentially an application
              of treeless tree parsing code selection to
              RAFTS. \emph{Common subexpression elimination}
              compares sequences of byte codes, and only tries to
              find expressions common with expressions present in
              registers. Register allocation gives four registers
              to local variables, and three registers plus any
              free variable registers to local register
              allocation. A very simple array bound checking
              optimization is implemented that can eliminate
              bounds checking for some accesses with constant
              indexes. Exception code is moved out-of-line. They
              also describe how they deal with garbage
              collection. Experiments show that the most important
              optimization is register allocation, the others have
              little influence. The compile time is 0.5--1s for
              the benchmarks used.}
}

@InProceedings{piumarta&riccardi98,
  author   = {Ian Piumarta and Fabio Riccardi},
  title    = {Optimizing Direct Threaded Code by Selective Inlining},
  pages    = {291--300},
  crossref = {sigplan98},
  url      = {ftp://ftp.inria.fr/INRIA/Projects/SOR/papers/1998/ODCSI_pldi98.ps.gz},
  annote   = {They reduce the overhead of a direct threaded interpreter
              by combining all the VM instructions in a basic block into
              a single virtual machine instruction. This is done by
              simply concatenating the machine code for the virtual
              machine instructions (except for the Next code). Multiple
              instances of the same sequence are just translated once,
              and then reused. They evaluated this technique empirically
              in the context of a fine-grained RISC-like VM, and in an
              Objective Caml interpreter. The speedup over plain direct
              threaded code for the RISC-like VM is a factor
              1.33--2.06. For the Caml VM the speedups varied with
              benchmark and processor, from 1 to 2.2. The code expansion
              ranges from 2.2--4 for the Sparc, with larger benchmarks
              having less expansion (due to more reuse of sequences).
              Implementing this technique on the Caml VM took only one
              day.}
}

@InProceedings{ayers+98,
  author   = {Andrew Ayers and Stuart de Jong and John Peyton and
              Richard Schooler},
  title    = {Scalable Cross-Module Optimization},
  pages    = {301--312},
  crossref = {sigplan98},
  annote   = {Optimization takes too much memory (1.7KB per source line)
              to allow straight-forward whole-program optimization in
              main memory. This paper describes various forms of
              compacting the intermediate representations, up to
              offloading the IR to disk. This reduces the memory
              consumption for optimizing 126.gcc from 250MB to 25MB,
              while increasing optimization time from 18~min to 30~min.
              The paper also presents other interesting issues in
              whole-program optimization, like speedups for SPECint95
              benchmarks (1.1--2.25) and huge CAD applications
              (1.4--1.7), dealing with bugs uncovered by optimization,
              and build compatibility.}
}

@InProceedings{gay&aiken98,
  author   = {David Gay and Alex Aiken},
  title    = {Memory Management with Explicit Regions},
  pages    = {313--323},
  crossref = {sigplan98},
  annote   = {Does an empirical evaluation of region/arena/zone-based
              memory allocation, and proposes and evaluates a safe
              version of this technique, based on reference counting
              (references into a whole region).}
}

@InProceedings{sweeney&tip98,
  author   = {Peter F. Sweeney and Frank Tip},
  title    = {A Study of Dead Data Members in {C++} Applications},
  crossref = {sigplan98},
  pages    = {324--332},
  annote   = {On average, 12.5\% of the data members are dead,
              taking up 4.4\% of the space.}
}

@InProceedings{necula&lee98,
  author   = {George C. Necula and Peter Lee},
  title    = {The Design and Implementation of a Certifying Compiler},
  crossref = {sigplan98},
  pages    = {333--344},
  annote   = {They use an optimizing compiler in combination with
              an assembly language certifier.}
}

@Proceedings{sigplan98,
  key       = {PLDI '98},
  title     = {SIGPLAN '98 Conference on Programming Language Design
               and Implementation},
  booktitle = {SIGPLAN '98 Conference on Programming Language Design
               and Implementation},
  year      = {1998}
}

@Book{rubinstein88,
  author    = {Richard Rubinstein},
  title     = {Digital Typography},
  year      = {1988},
  publisher = {Addison-Wesley},
  annote    = {Discusses many issues involved in typography and the
               problems of dealing with it automatically. Includes an
               annotated bibliography.}
}

@InProceedings{park&moon98,
  author    = {Jinpyo Park and Soo-Mook Moon},
  title     = {Optimistic Register Coalescing},
  booktitle = {Parallel Architectures and Compilation Techniques (PACT
               '98)},
  year      = {1998},
  pages     = {196--204}
}

@InProceedings{simons98,
  author   = {Anthony J. H. Simons},
  title    = {Borrow, Copy or Steal? Loans and Larceny in the Orthodox
              Canonical Form},
  pages    = {65--83},
  crossref = {oopsla98},
  annote   = {Gives a good discussion of dealing with memory
              (de)allocation and change semantics in C++ (and other
              languages without automatic storage reclamation). The
              standard solution is to copy everywhere. This paper
              presents borrowing (copy-on-write), and stealing, a more
              complex scheme, where the user of the object ensures that
              the reference to the object is dead, and therefore no
              copying is necessary. The techniques are useful in
              different circumstances.}
}

@InProceedings{litvinov98,
  author   = {Vassily Litvinov},
  title    = {Constraint-Based Polymorphism in {Cecil}: Towards a
              Practical and Static Type System},
  crossref = {oopsla98},
  pages    = {388--411},
  annote   = {Presents a type system for Cecil (and other
              object-oriented programming languages), and uses it
              for statically type-checking Vortex, a 100000 line
              Cecil program. Apart from adding type annotations,
              very little code needed to be rewritten. But the
              static typechecker also found only few bugs.}
}

@Proceedings{oopsla98,
  key       = {OOPSLA '98},
  title     = {Conference on Object-Oriented Programming Systems,
               Languages \& Applications (OOPSLA '98)},
  booktitle = {Conference on Object-Oriented Programming Systems,
               Languages \& Applications (OOPSLA '98)},
  year      = {1998}
}

@InProceedings{seidl&zorn98,
  author   = {Matthew L. Seidl and Benjamin G. Zorn},
  title    = {Segregating Heap Objects by Reference Behaviour and
              Lifetime},
  crossref = {asplos98},
  pages    = {12--23},
  annote   = {Uses a profile-feedback method to predict whether
              an object is highly referenced, not highly
              referenced, short-lived or other. Several predictors
              were used (plain allocation site is not sufficient),
              in particular call path and stack content. The stack
              content predictor did quite well for some programs
              and some VM sizes, but overall I was not really
              convinced.}
}

@InProceedings{sodani&sohi98,
  author   = {Avinash Sodani and Gurindar S. Sohi},
  title    = {An Empirical Analysis of Instruction Repetition},
  crossref = {asplos98},
  pages    = {35--45},
  annote   = {Previous papers have observed that many dynamically
              executed instructions have the same inputs (and thus
              the same outputs) as earlier instances of the same
              instruction. This paper analyses where these
              repetitions come from; it not only considers
              instructions where all instances have the same
              arguments, but uses a repetition buffer with up to
              1000 instances. The results are: Very few static
              instructions cause most of the dynamic repetition
              (except for m88ksim); 56\%--99\% of the dynamic
              instructions are repetitions; 5\%--40\% of the
              repetitions are due to instructions with a single
              instance. The paper uses data flow analysis on
              traces to determine where the repeated inputs come
              from, globally: external input, global initialized
              data, program internals (immediate and
              immediate-derived, e.g., loop counters),
              uninitialized data. Internals dominate (52\%--86\%
              of repeated instructions), followed by global init
              (14\%--30\%) and external (0\%--30\%); uninit plays
              a minor role ($<$1\%). The paper also does an
              analysis that takes functions into account, in
              particular argument repetition: 59\%--98\% of
              dynamic function calls are repetitions of a call
              with all arguments same; Only 0\%--16\% of the
              dynamic calls do not have a single repeated
              argument. The paper breaks down the instruction
              repetitions in terms of their role in the
              function. One interesting result peripheral to the
              paper is that the function prologue and epilogue
              code takes 2\%--25\% of the executed
              instructions. It also comments on the possibility of
              exploiting the repetitions in software, but is not
              very encouraging. It also comments on hardware
              exploitation, which could capture 30\%--74\% of all
              dynamic instructions, but does not look very useful
              to me.}
}

@InProceedings{lee+98,
  author   = {Walter Lee and Rajeev Barua and Matthew Frank and
              Devabhaktuni Srikrishna and Jonathan Babb and
              Vivek Sarkar and Saman Amarasinghe},
  title    = {Space-Time Scheduling of Instruction-Level Parallelism on
              a Raw Machine},
  pages    = {46--57},
  crossref = {asplos98},
  annote   = {The Raw machine consists of a number of R2000-based tiles,
              each of which is a complete processor. The execution model
              used is called NURA (non-uniform register access), which
              allows to exploit instruction-level parallelism. The paper
              describes and evaluates the scheduler for this
              architecture. One of the interesting results is that the
              multiple pcs (allowing partially asynchronous operation)
              make the performance much less vulnerable to effects that
              are not statically predictable.}
}

@InProceedings{hammond+98,
  author   = {Lance Hammond and Mark Willey and Kunle Olukotun},
  title    = {Data Speculation Support for a Chip Multiprocessor},
  pages    = {58--69},
  crossref = {asplos98},
  annote   = {The machine architecture here uses parallelism among small
              threads, with all communication happening through memory,
              and some hardware support for speculation and
              synchronization.}
}

@InProceedings{machanick+98,
  author   = {Philip Machanick and Pierre Salverda and Lance
              Pompe},
  title    = {Hardware-Software Trade-Offs in a Direct {Rambus}
              Implementation of the {RAMpage} Memory Hierarchy},
  crossref = {asplos98},
  pages    = {105--114},
  annote   = {Proposes and evaluates using software-controlled
              paging between SRAM and DRAM instead of using
              hardware-controlled cache policies. That would allow
              getting rid of the tags of the last cache level, and
              using this chip area for other purposes, among other
              things. The evaluation shows that managing DRAM with
              paging is competitive.}
}

@InProceedings{roth+98,
  author   = {Amir Roth and Andreas Moshovos and Gurindar S. Sohi},
  title    = {Dependence Based Prefetching for Linked Data Structures},
  pages    = {115--126},
  crossref = {asplos98},
  annote   = {Proposes and evaluates a hardware mechanism that records
              access patterns and prefetches memory locations that will
              probably be needed by pointer-chasing code.}
}

@InProceedings{le98,
  author   = {Bich C. Le},
  title    = {An Out-of-Order Execution Technique for Runtime
              Binary Translators},
  crossref = {asplos98},
  pages    = {151--158},
  annote   = {The paper attacks the problem of dealing with
              exceptions in a binary translated, scheduled
              program. It uses a checkpointing approach. The task
              is particularly simple because the paper has a 1:1
              mapping of architectural registers of the emulated
              machine to registers of the executing machine, and
              the executing machine has extra registers. Each
              superblock start is a checkpoint, with all values in
              the emulated architectural registers. One
              performance problem with this compiler-based
              reordering is false exceptions (exceptions
              speculatively taken by the translated code but not
              by the emulated code); this problem is solved by
              retranslating the superblock without scheduling
              after one occurrence of a false exception.}
}

@InProceedings{stark+98,
  author   = {Jared Stark and Marius Evers and Yale N. Patt},
  title    = {Variable Length Path Branch Prediction},
  pages    = {170--179},
  crossref = {asplos98}
}

@InProceedings{temam98,
  author   = {Olivier Temam},
  title    = {Investigating Optimal Local Memory Performance},
  pages    = {218--227},
  crossref = {asplos98},
  url      = {http://www.lri.fr/~temam/Articles/old-Te98.ps.gz},
  annote   = {Presents an optimal algorithm (with foresight) for
              exploiting both space and time locality and uses it for
              evaluating areas where the current caching policies could
              benefit. One of the conclusions is that the meat in
              improving caches is in improving the replacement policy
              because many of the words contained in a cache are dead
              and can be replaced without causing a miss. A surprising
              result is that the usual way of determining cache lines
              gives better results than methods that appear less
              arbitrary.}
}

@InProceedings{citron+98,
  author   = {Daniel Citron and Dror Feitelson and Larry Rudolph},
  title    = {Accelerating Multi-Media Processing by Implementing
              Memoing in Multiplication and Division Units},
  pages    = {252--261},
  crossref = {asplos98}
}

@InProceedings{fu+98,
  author   = {Chao-ying Fu and Matthew D. Jennings and
              Sergei Y. Larin and Thomas M. Conte},
  title    = {Value Speculation Scheduling for High Performance
              Processors},
  pages    = {262--271},
  crossref = {asplos98},
  annote   = {Combines hardware for predicting results of instructions
              with a scheduling technique like run-time
              disambiguation.}
}

@InProceedings{ranganathan&franklin98,
  author   = {Narayan Ranganathan and Manoj Franklin},
  title    = {An Empirical Study of Decentralized {ILP} Execution
              Models},
  crossref = {asplos98},
  pages    = {272--281},
  annote   = {Compares different models of distributing
              instructions among different units in a CPU:
              execution unit based decentralization (EDD) is the
              normal way of grouping instructions by functional
              unit. Data dependence based decentralization (DDD)
              groups instructions together that have a data
              dependence relationship; Control Dependence based
              decentralization (CDD) groups instructions that have
              the same control dependences (the CPU/superblock
              style hardware). The paper assumes that
              inter-execution-unit communication happens in an
              uni- or bidirectional ring (not very realistic for
              DDD). The paper analyses how these models scale to
              more execution units: EDD scales pretty badly. The
              smarter DDD algorithm performs best, but loses
              performance beyond 8 EUs (probably because of the
              ring architecture), and is not worthwhile beyond 4
              EUs. CDD has no such thrashing, but manages to catch
              up with DDD's 4-EU performance only with 8--16 EUs.}
}

@inproceedings{schnarr&larus98,
  author   = {Schnarr, Eric and Larus, James R.},
  title    = {Fast Out-of-Order Processor Simulation Using Memoization},
  pages    = {283--294},
  crossref = {asplos98},
  annote   = {The simulator described in the paper performs cycle-accurate
              simulation by executing code fragments directly for
              functional simulation, intermixed with a memoizing
              computation of what happens at the cycle level. The result is
              a speedup factor of 5--12.},
}

@inproceedings{jacob&mudge98,
  author   = {Jacob, Bruce L. and Mudge, Trevor N.},
  title    = {A Look at Several Memory Management Units, TLB-Refill
              Mechanisms, and Page Table Organizations},
  pages    = {295--306},
  crossref = {asplos98},
  annote   = {Describes several MMU/TLB/Page Table Organizations of various
              machines and their operating systems, and evaluates and
              analyses the performance impact of the various choices.},
}

@proceedings{asplos98,
  key       = {ASPLOS-VIII},
  title     = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-VIII)},
  booktitle = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-VIII)},
  year      = {1998},
}

@book{giampaolo99,
  author    = {Giampaolo, Dominic},
  title     = {Practical File System Design},
  publisher = {Morgan Kaufmann},
  year      = {1999},
  annote    = {Gives a good overview of file system design issues, with BFS
               (BeOS file system) as a running example; in addition to the
               usual stuff, this book discusses journaling, indexing (BFS
               supports indexes over file attributes that are always
               up-to-date) and APIs, as well as file system benchmarking
               and testing.},
}

@InProceedings{visser+98,
  author =       {Eelco Visser and Zine-el-Abidine Benaissa and Andrew
                  Tolmach},
  title =        {Building Program Optimizers with Rewriting
                  Strategies},
  crossref =     {icfp98},
  pages =        {13--26},
  annote =       {Uses a somewhat Prolog-like language for defining
                  rewriting strategies.}
}

@InProceedings{findler&flatt98,
  author =       {Robert Bruce Findler and Matthew Flatt},
  title =        {Modular Object-Oriented Programming with Units and
                  Mixins},
  crossref =     {icfp98},
  pages =        {94--104},
  annote =       {Discusses how both the classes (mixins) and the
                  operations in a class can be extended in
                  MzScheme. An important concept here is the module
                  (unit). The paper explains the basic problem and
                  its solution quite nicely.}
}

@InProceedings{finne+98,
  author =       {Sigbjorn Finne and Daan Leijen and Erik Meijer and
                  Peyton Jones, Simon},
  title =        {{H/Direct}: A Binary Foreign Language Interface for
                  {Haskell}},
  crossref =     {icfp98},
  pages =        {153--162},
  annote =       {The interface is defined in IDL.}
}

@InProceedings{karczmarczuk98,
  author =       {Jerzy Karczmarczuk},
  title =        {Functional Differentiation of Computer Programs},
  crossref =     {icfp98},
  pages =        {195--203},
  annote =       {Computational differentiation works by computing
                  all needed derivatives of an operation along with
                  the operation; for a complex function, this computes
                  all needed derivatives of the function at one point
                  along with the function result.  This paper explores
                  how to do this in a functional context.}
}

@proceedings{icfp98,
  key       = {ICFP '98},
  title     = {International Conference on Functional Programming (ICFP
               '98)},
  booktitle = {International Conference on Functional Programming (ICFP
               '98)},
  year      = {1998},
  note      = {SIGPLAN Notices 34(1) (1999)},
}

@article{briggs+98,
  author  = {Briggs, Preston and Cooper, Keith D. and Harvey, Timothy J.
             and Simpson, L. Taylor},
  title   = {Practical Improvements to the Construction and Destruction of
             Static Single Assignment Form},
  journal = spe,
  volume  = {28},
  number  = {8},
  pages   = {859--881},
  year    = {1998},
  annote  = {Presents several improvements to SSA construction and
             destruction: 1) Semi-pruned SSA form has almost as few
             $\phi$-nodes as the pruned form, but can be built faster;
             however, the speedup is small, especially compared to the
             time needed in an optimization like value numbering. 2) An
             improvement in stack manipulation when building SSA form. 3)
             Dealing with the problems of converting the parallel
             $\phi$-nodes into sequential copies, especially after copy
             folding; one of the problems they attack is register
             shuffling, but I don't find their algorithm and their
             explanation of it very impressive.},
}

@article{yuen99,
  author  = {Yuen, C. K.},
  title   = {Stack and RISC},
  journal = can,
  volume  = {27},
  number  = {1},
  pages   = {3--9},
  month   = mar,
  year    = {1999},
  annote  = {Discusses using out-of-order mechanisms for stack machines
             (stack reorder buffer). One interesting argument is that
             stack machines provide an advantage, because with them the
             death of a value is known immediately, so we do not need the
             writeback of the value.},
}

@Article{postiff+99,
  author =       {Matthew A. Postiff and David A. Greene and Gary
                  S. Tyson and Trevor N. Mudge},
  title =        {The Limits of Instruction Level Parallelism in
                  {SPEC95} Applications},
  journal =      can,
  year =         {1999},
  volume =       {27},
  number =       {1},
  month =        mar,
  pages =        {31--34},
  annote =       {Some interesting results: Stack pointer updates
                  are in the critical path of many benchmarks; there
                  is parallelism across millions of dynamically
                  executed instructions.}
}

@TechReport{steele77,
  author =       {Steele, Jr., Guy Lewis},
  title =        {Debunking the ``Expensive Procedure Call'' Myth or
                  Procedure Call Implementations Considered Harmful or
                  Lambda: The Ultimate Goto},
  institution =  {MIT AI Lab},
  year =         {1977},
  type =         {AI Memo},
  number =       {443},
  month =        oct,
  annote =       {Takes a look at procedure calls, both from the
                  implementation side (advocating tail-call
                  optimization and caller-saved registers), and from
                  the programming language design and programming side
                  (in the context of the structured programming
                  debate).  One interesting aspect is that it advocates
                  dividing even quite simple push-and-jump variants
                  into several instructions (a very RISCy approach).}
}

@InProceedings{moessenboeck90,
  author =       {M{\"o}ssenb{\"o}ck, Hanspeter},
  title =        {A Generator for Production-Quality Compilers},
  booktitle =    {Compiler Compilers},
  pages =        {48--61},
  year =         {1990},
  volume =       {477},
  series =       {LNCS},
  publisher =    {Springer},
  annote =       {A description of Coco/R, with comments on the
                  history and the design decisions. About half of the
                  paper deals with error recovery.}
}

@article{fleming&wallace86,
  author  = {Fleming, Philip J. and Wallace, John J.},
  title   = {How not to Lie with Statistics: The Correct Way to Summarize
             Benchmark Results},
  journal = cacm,
  volume  = {29},
  number  = {3},
  pages   = {218--221},
  month   = mar,
  year    = {1986},
  annote  = {Advocates the use of the geometric mean in summarizing
             normalized benchmark results.},
}

@article{smith88,
  author  = {Smith, James E.},
  title   = {Characterizing Computer Performance with a Single Number},
  journal = cacm,
  volume  = {31},
  number  = {10},
  pages   = {1202--1206},
  month   = oct,
  year    = {1988},
  annote  = {Advocates the use of the arithmetic mean for times, the
             harmonic mean for rates, and normalizing after aggregating
             (i.e., not averaging normalized numbers).},
}

@Book{knuth99,
  author =       {Donald E. Knuth},
  title =        {Digital Typography},
  publisher =    {CSLI Publications},
  year =         {1999},
  address =      {Stanford, CA},
  annote =       {A collection of articles and other stuff about \TeX{}
                  and Metafont.}
}

@InProceedings{johnstone&wilson98,
  author =       {Mark S. Johnstone and Paul R. Wilson},
  title =        {The Memory Fragmentation Problem: Solved?},
  booktitle =    {International Symposium on Memory Management (ISMM
                  '98)},
  pages =        {26--36},
  year =         {1998},
  note =         {SIGPLAN Notices 34(3) (1999)},
  annote =       {Compares a number of memory allocation strategies
                  wrt fragmentation (with explicit deallocation),
                  using several real programs as workload (instead of
                  a synthetic workload). A number of strategies
                  exhibit very low fragmentation. In particular, all
                  the best fit strategies did quite well (best with
                  address ordered free lists), and also first fit with
                  address ordering, and Doug Lea's allocator. The
                  paper gives a number of explanations for these
                  results.}
}

@inproceedings{chang&gibson99,
  author   = {Chang, Fay and Gibson, Garth A.},
  title    = {Automatic I/O Hint Generation through Speculative Execution},
  pages    = {1--14},
  crossref = {osdi99},
  annote   = {While the actual process is blocked by reads, a speculative
              thread runs ahead, and generates read hints (used for
              prefetching) instead of reads (and, of course, it does not
              produce output). If the speculative thread appears to stray
              from the normal execution path or fall behind, it is
              reinitialized from the current state of the main thread.
              This idea has been tested on three read-intensive programs
              and produced 29\%--70\% speedup on a machine with a 4-disk
              RAID~0 (fewer disks produce less speedup, more disks hardly
              more). If the hints were ignored, the slowdown was
              1\%--4\%, indicating that the scheme costs little if it is
              useless; however, the Gnuld benchmark had a 15\% slowdown
              when using the hints with one disk, apparently because
              scarce disk bandwidth is wasted on useless prefetches. For
              two of the benchmarks the hints are quite accurate, but for
              Gnuld many hints caused prefetches of useless blocks, and
              useful blocks were not prefetched; Gnuld was also the only
              benchmark where manual prefetching seriously outperformed
              the new technique.},
}

@InProceedings{pai+99,
  author =       {Vivek S. Pai and Peter Druschel and Willy
                  Zwaenepoel},
  title =        {{IO-Lite}: A Unified {I/O} Buffering and Caching System},
  crossref =     {osdi99},
  pages =        {15--28},
  annote =       {IO-Lite avoids copying by using arbitrary-sized
                  read-only buffers and passing (arrays of)
                  descriptors for these buffers around. Thus data can
                  be read in from a file into a web server and passed
                  to the networking code without ever being
                  copied. The paper reports a throughput increase of
                  40\%--80\% for webserving. To get all the
                  performance advantages, the applications need to use
                  new interfaces, but even with the stdio interface
                  there is a measurable speedup (1.03) in gcc. The
                  paper is a little light on how the applications
                  and the OS manage the buffers.}
}

@inproceedings{wang+99,
  author   = {Wang, Randolph Y. and Anderson, Thomas E. and Patterson,
              David A.},
  title    = {Virtual Log Based File System for a Programmable Disk},
  pages    = {29--43},
  crossref = {osdi99},
  annote   = {This paper contains a number of interesting ideas, but IMO
              many of the conclusions are flawed or unsupported. The first
              interesting idea is that nowadays disks are so intelligent
              that they could take over more tasks, including providing
              the file system (but how does that fit with RAIDs?). This
              paper, however, appears to focus on the idea that the disk
              intelligence is used to organize the disk as a log (with
              compaction done by the disk); the special twist in this log
              is that it tries to write immediately, on any free block
              close to the current head position (eager writing), instead
              of writing the log in segments. Unfortunately cleaning (the
              reason for having segments in normal logs) is discussed in a
              tech report, not in the paper. The main benefit of this
              scheme claimed by the paper is better performance on small
              synchronous writes; I see two flaws here: 1) The paper
              suggests alleviating the read performance problems of this
              scheme by using large caches; thus there will hardly be any
              disk reads between writes and eager writing should not
              perform significantly better than a conventional log (and
              their data seems to support this view). 2) The paper assumes
              that the pointer to the end of the log will be written to a
              fixed block upon power failure; if the disk is able to do
              that, it should also be able to write a full track of data
              to the disk upon power failure; the disk could use this to
              delay writes, as if it had a full track of NVRAM, and use
              this to optimize writing and thus alleviate the
              small-synchronous-writes problem. The paper gives numbers
              from synthetic benchmarks using a simulation of the disks
              (apparently assuming that the disk writes synchronously),
              showing significant improvements of the virtual log disk for
              UFS, and for some scenarios also over a log-structured file
              system.},
}

@InProceedings{dougan+99,
  author =       {Cort Dougan and Paul Mackerras and Victor Yodaiken},
  title =        {Optimizing the Idle Task and Other {MMU} Tricks},
  crossref =     {osdi99},
  pages =        {229--237},
  url =          {http://hq.fsmlabs.com/~cort/papers/linuxppc-mm/linuxppc-mm.ps},
  annote =       {Discusses various optimizations of memory management
                  stuff in Linux/PPC. The optimizations are: mapping
                  the kernel with BAT (block address translation)
                  registers instead of the TLB; better choice of
                  segment IDs (VSIDs) to get a higher hash table hit
                  ratio; hand optimizing the TLB miss code; on the
                  603, don't use the hash tables upon TLB miss, use
                  the page tables directly; instead of flushing stale
                  entries from the TLB and hash table, just change the
                  involved VSID; turn off the cache on TLB miss to
                  avoid polluting the cache with page table entries;
                  clear free pages in the idle task, with caches
                  turned off. Not all of these optimizations are
                  supported with convincing data in the paper, but the
                  effect of their combination is quite good. One
                  interesting result was that apparently the kernel
                  compile benchmark was originally suffering quite a
                  lot from TLB misses (just mapping the kernel with
                  BATs reduced wall-clock time by a factor 1.25).}
}

@inproceedings{hutchinson+99,
  author   = {Hutchinson, Norman C. and Manley, Stephen and Federwisch,
              Mike and Harris, Guy and Hitz, Dave and Kleiman, Steven and
              O'Malley, Sean},
  title    = {Logical vs. Physical File System Backup},
  pages    = {239--249},
  crossref = {osdi99},
  annote   = {Discusses and measures logical (i.e., file-based) vs.
              physical (i.e., block-based) file system backups, in
              particular in the context of the WAFL file system; WAFL (a
              variation on log-structured file systems) makes it possible
              to have incremental physical backups. On the performance
              side, physical backup is about 20\% faster than dump for a
              single tape drive, uses much less CPU (5\% vs. 25\%), and
              scales better for more tape drives (logical backup becomes
              disk-bound with four tapes and also consumes 90\% of the
              CPU). The paper also contains a bit of info on WAFL.},
}

@InProceedings{groenvall+99,
  author =       {Bj{\"o}rn Gr{\"o}nvall and Assar Westerlund and Stephen Pink},
  title =        {The Design of a Multicast-Based Distributed File System},
  crossref =     {osdi99},
  pages =        {251--264},
  annote =       {Describes the design of JetFile.}
}

@inproceedings{gopal&manber99,
  author   = {Gopal, Burra and Manber, Udi},
  title    = {Integrating Content-Based Access Mechanisms with Hierarchical
              File Systems},
  pages    = {265--278},
  crossref = {osdi99},
  annote   = {Includes a long Related Work section.},
}

@proceedings{osdi99,
  key       = {OSDI '99},
  title     = {Operating Systems Design and Implementation (OSDI '99)},
  booktitle = {Operating Systems Design and Implementation (OSDI '99)},
  year      = {1999},
}

@phdthesis{pelegri-llopart88,
  author = {Pelegri-Llopart, Eduardo},
  title  = {Rewrite Systems, Pattern Matching, and Code Generation},
  school = {University of California, Berkeley},
  year   = {1988},
}

@inproceedings{serrano97,
  author    = {Serrano, Manuel},
  title     = {Inline Expansion: \emph{When} and \emph{How}},
  booktitle = {Programming Languages, Implementation and Logic Programming
               (PLILP)},
  series    = {LNCS},
  volume    = {1292},
  publisher = {Springer},
  pages     = {143--157},
  year      = {1997},
  annote    = {Examines a number of heuristics for determining which calls
               to inline and then presents his own; unfortunately the
               empirical evaluation does not compare the approaches, only
               different decision functions and their parameters for the
               new framework. The \emph{how} part discusses how to treat
               recursive functions. One interesting effect is that the
               worst code growth of any combination of decision function
               and parameters is only 1.08 and there are even code shrinks
               down to a factor 0.82 (for Scheme benchmarks).},
}

@InProceedings{debosschere+94,
  author =       {De Bosschere, Koen and Saumya Debray and David
                  Gudeman and Sampath Kannan},
  title =        {Call Forwarding: A Simple Interprocedural
                  Optimization Technique for Dynamically Typed
                  Languages},
  crossref =     {popl94},
  pages =        {409--420},
  annote =       {Attacks the problem of type checks at the entry of
                  procedures in dynamically-typed languages, or, in
                  general, any kind of entry action (e.g.,
                  unboxing). For some call sites some of these entry
                  actions are unnecessary (e.g., for typechecks,
                  because the type is known). The paper tries to
                  exploit this for optimization by having several
                  entry points for the procedure, and every call site
                  calling an entry point that precedes all necessary
                  (and possibly some unnecessary) entry actions. The
                  entry actions can be ordered to minimize executing
                  unnecessary entry actions; this problem is
                  NP-complete. Another approach to dealing with the
                  problem is to inline some entry actions into the
                  call sites; the paper claims that this approach leads
                  to significant code bloat in a number of application
                  programs. The paper introduces a greedy algorithm
                  for ordering the entry actions and performing
                  bounded inlining of entry actions; this algorithm
                  performs optimally for all presented benchmarks, but
                  the paper does not tell what bound on code growth
                  was used (except maybe in footnote 3; it appears to
                  imply that the implementation of the algorithm
                  copies just enough to avoid all unnecessary entry
                  actions; then the optimality is trivial, but what
                  about code growth (no data presented for
                  that)?). The improvements in execution time are
                  12\%--45\%.}
}

@proceedings{popl94,
  key       = {POPL '94},
  title     = {Principles of Programming Languages (POPL '94)},
  booktitle = {Principles of Programming Languages (POPL '94)},
  year      = {1994},
}

@inproceedings{goubault94,
  author    = {Goubault, Jean},
  title     = {Generalized Boxings, Congruences and Partial Inlining},
  booktitle = {Static Analysis Symposium (SAS '94)},
  series    = {LNCS},
  volume    = {864},
  publisher = {Springer},
  pages     = {147--161},
  year      = {1994},
  annote    = {Discusses several ways to optimize boxing and unboxing. One
               of the approaches is partial inlining: inlining the unboxing
               at the start and the unboxing at the end into the caller,
               and then optimizing it away; this can be extended to also
               inline tests like ML's pattern matching code. The paper
               reports no practical experience with partial inlining.},
}

@Article{kaser&ramakrishnan98,
  author =       {Owen Kaser and C. R. Ramakrishnan},
  title =        {Evaluating Inlining Techniques},
  journal =      complang,
  year =         {1998},
  volume =       {24},
  OPTnumber =    {},
  pages =        {55--72},
  annote =       {Inlining the original versions of procedures is more
                  powerful than inlining the current version (which may
                  have some inlinings already applied), but the
                  difference only plays a role for recursive
                  procedures. The paper also discusses an inlining
                  heuristic based on call frequencies; with this
                  heuristic (and probably others), original-version
                  inlining tends to have problems with local maxima,
                  so the paper proposes a hybrid strategy. The paper
                  presents an empirical comparison of the techniques
                  it presents and some others; the hybrid strategy
                  removes 36\%--97\% of all inlinable calls with 5\%
                  code expansion and 62\%--100\% with 20\% code
                  growth.}
}

@Book{wulf+75,
  author =       {William Wulf and Richard K. Johnsson and Charles
                  B. Weinstock and Steven O. Hobbs and Charles M. Geschke},
  title =        {The Design of an Optimizing Compiler},
  publisher =    {Elsevier},
  year =         {1975},
  isbn =         {0-444-00164-6},
  annote =       {Describes a complete Bliss/11 compiler for the
                  PDP-11.  It uses some interesting techniques: it
                  uses a (hand-constructed) tree parsing automaton for
                  parts of the code selection (Section~3.4); it
                  optimizes the use of unary complement operators
                  (Section~3.3); it uses a smart scheme to represent
                  a conservative approximation of the lifetime of
                  variables in constant space and uses that for
                  register allocation (Sections~4.1.3 and~4.3).}
}

@article{rosenblum&ousterhout92,
  author  = {Rosenblum, Mendel and Ousterhout, John K.},
  title   = {The Design and Implementation of a Log-Structured File System},
  journal = tocs,
  volume  = {10},
  number  = {1},
  pages   = {26--52},
  year    = {1992},
  annote  = {Gives a rather high-level description of the Sprite
             log-structured file system; there are too many details
             missing for my taste. But the discussion of cleaning and, in
             particular, cleaning policies is quite detailed; the paper
             compares a greedy heuristic that cleans segments with least
             utilization, with a cost/benefit heuristic that tends to
             clean segments with old (and rarely-changing) data with
             higher utilization than young segments; the cost/benefit
             heuristic works significantly better. The intuition is that
             old data is unlikely to die soon, so we might just as well
             reclaim the free space associated with it now and use it,
             instead of waiting for a long time until utilization falls
             below the threshold of the greedy heuristic. The file system
             maintains a segment usage table to have data for this
             heuristic. The paper also presents empirical data showing
             that the Sprite LFS is much faster than SunOS FFS on small
             files, and that the cleaning behaviour on real workloads is
             even better than in the simulations they presented earlier.
             One interesting statement was that the Sprite LFS is not more
             complex than FFS: it needs log handling and the cleaner, but
             can skip complex block allocation algorithms and fsck.},
}

@inproceedings{tweedie98,
  author    = {Tweedie, Stephen},
  title     = {Journaling the {Linux} ext2fs Filesystem},
  booktitle = {LinuxExpo '98},
  year      = {1998},
  url       = {ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz},
  annote    = {A nice description of adding journaling to a conventional
               file system.},
}

@inproceedings{mckusick&ganger99,
  author   = {McKusick, Marshall Kirk and Ganger, Gregory R.},
  title    = {Soft Updates: A Technique for Eliminating Most Synchronous
              Writes in the Fast Filesystem},
  pages    = {1--17},
  year     = {1999},
  crossref = {freenix99},
  annote   = {Soft updates enhance the BSD FFS for safe asynchronous
              writing and fast crash recovery. The data and meta-data is
              written in such an order that the only inconsistency on disk
              can be allocated, but unused blocks or inodes. Thus the file
              system can be mounted without being checked. Soft updates
              also increase the speed of updating the file system by
              requiring fewer writes and hardly any synchronous writes.
              The paper gives a pretty detailed account of the changes in
              FFS, discusses the experiences, and provides some empirical
              results (showing that soft updates can be quite a bit faster
              than FFS's synchronous metadata updates and close to plain
              asynchronous updates). The paper also explains how to do
              snapshots on a conventional file system: A snapshot is a
              sparse file with the size of the file system; when a block in
              the file system is written, a copy of the old contents are
              written into the snapshot. Soft updates uses snapshots for
              running fsck (to reclaim lost space) and backups while having
              the file system mounted and writable.},
}

@Proceedings{freenix99,
  title =        {FREENIX Track, USENIX Annual Technical Conference},
  booktitle =    {FREENIX Track, USENIX Annual Technical Conference},
  year =         {1999},
  key =          {FREENIX '99}
}

@inproceedings{blackwell+95,
  author   = {Blackwell, Trevor and Harris, Jeffrey and Seltzer, Margo},
  title    = {Heuristic Cleaning Algorithms in Log-Structured File Systems},
  pages    = {277--288},
  crossref = {usenix95},
  url      = {http://www.eecs.harvard.edu/~tlb/usenixw95.ps},
  annote   = {The authors gathered traces of file accesses by snooping the
              network traffic of three dedicated NFS servers in two
              environments (university and business). Then they used these
              traces on an LFS simulator to measure the effects of
              cleaning and evaluate some cleaning parameters (but not
              heuristics involving, e.g., age, utilization etc.); the file
              systems were 90\% full. Results: After two seconds idle
              time, there is a high probability (86\%--98\%) for another
              two seconds idle time; cleaner interference is small: an
              average of 0.02--0.07 cleaner requests are in the queue when
              a user request arrives. Most cleaning is done in the
              background, on-demand cleaning is rare (0\%--3.3\% of the
              segments cleaned). The amount of data written between idle
              gaps for cleaning was usually less than 16MB and never more
              than 350MB. 70\%--90\% of the requests could be satisfied
              immediately, and nearly all within 100ms.},
}

@InProceedings{seltzer+95,
  author =       {Margo Seltzer and Keith A. Smith and Hari
                  Balakrishnan and Jacqueline Chang and Sara McMains
                  and Venkata Padmanabhan},
  title =        {File System Logging Versus Clustering: A Performance
                  Comparison},
  crossref =     {usenix95},
  pages =        {249--264},
  url =          {http://www.eecs.harvard.edu/~margo/usenix.195/usenix.195.ps.gz},
  critique-url = {http://www.scriptics.com/people/john.ousterhout/seltzer.html},
  sources-url =  {http://www.eecs.harvard.edu/~margo/usenix.195/},
  annote =       {This paper compares the BSD LFS with an improved
                  version of BSD FFS using various benchmarks.}
}

@inproceedings{vahalia+95,
  author   = {Vahalia, Uresh and Gray, Cary G. and Ting, Dennis},
  title    = {Metadata Logging in an NFS Server},
  pages    = {265--276},
  crossref = {usenix95},
  annote   = {Describes the addition of meta-data-only journaling to BSD
              FFS in order to get good latency for NFS writes (hmm,
              doesn't NFS also require data writes to be persistent before
              reporting completion?). Contains some interesting
              discussions of various aspects (e.g., replaying logical
              updates is a problem, so physical logging is preferable).},
}

@Proceedings{usenix95,
  key       = {Usenix '95},
  title     = {Usenix Annual Technical Conference},
  booktitle = {Usenix Annual Technical Conference},
  year      = {1995}
}

@Article{johnson&laing96,
  author =	 {James E. Johnson and William A. Laing},
  title =	 {Overview of the {Spiralog} File System},
  journal =	 {Digital Technical Journal},
  year =	 {1996},
  volume =	 {8},
  number =	 {2},
  pages =	 {5--14},
  OPTurl =	 {},
  annote =	 {The Spiralog file system is based on a
                  log-structured local file system, remote access
                  provided by a client(clerk)/server combination, a
                  low-level file system interface (VPI) on which two
                  file system personalities are built and a backup
                  system. This paper gives an overview, and there are
                  companion papers \cite{whitaker+96,green+96}
                  describing the components in more detail.}
}

@Article{whitaker+96,
  author =	 {Christopher Whitaker and J. Stuart Bayley and Rod
                  D. W. Widdowson},
  title =	 {Design of the Server for the {Spiralog} File System},
  journal =	 {Digital Technical Journal},
  year =	 {1996},
  volume =	 {8},
  number =	 {2},
  pages =	 {15--31},
  url =		 {http://www.digital.com/info/DTJM02/DTJM02P8.PS},
  annote =	 {Discusses the on-disk structure and related stuff of
                  the Spiralog file system and includes some hindsight
                  comments. Spiralog is a log-structured file system
                  that uses 256KB-segments for allocation and
                  cleaning; in contrast to block-oriented FSs like the
                  BSD FFS, Spiralog's basic objects are byte streams
                  and named cells, organized in B-trees. The file
                  system is organized into a log driver layer that
                  provides an infinite log address space (contrast to
                  \cite{dejonge+93}, where a fixed-size
                  update-in-place address space is provided), a
                  mapping layer that maps the objects onto this log
                  address space, and the cleaner. Each segment
                  contains a data area and a 24KB commit record area,
                  making it easy to distinguish commits from user data
                  but incurring a space and time penalty. Cleaning is
                  performed on-demand (when fewer than 300 segments
                  are available); the cleaner's goal is to write full
                  segments, emptying other segments is a side
                  effect. Spiralog's backup utilities work at the log
                  driver layer, with interesting consequences:
                  segments that are part of a snapshot cannot be
                  cleaned; incremental backups just back up a range in
                  the log address space; no support for user-visible
                  snapshots is needed or provided.}
}

@Article{green+96,
  author =	 {Russel J. Green and Alasdair C. Baird and
                  J. Christopher Davies},
  title =	 {Designing a Fast, On-line Backup System for a
                  Log-structured File System},
  journal =	 {Digital Technical Journal},
  year =	 {1996},
  volume =	 {8},
  number =	 {2},
  pages =	 {32--45},
  OPTurl =	 {},
  annote =	 {The Spiralog backup system is primarily physical,
                  but uses log addresses instead of physical addresses
                  (avoiding a dependence on partition sizes); it
                  copies the live segments to tape completely (i.e.,
                  including dead blocks); incremental backups are
                  performed by backing up a range of segments (which
                  may contain only parts of changed files, but also
                  old stuff copied by the cleaner). This avoids the
                  need to have logical snapshots, simplifying the
                  cleaner (with physical snapshots it just has to
                  refrain from cleaning the segments in the
                  snapshot).}
}

@InProceedings{matthews+97,
  author =	 {Jeanna Neefe Matthews and Drew Roselli and Adam
                  M. Costello and Randolph Y. Wang and Thomas
                  E. Anderson},
  title =	 {Improving the Performance of Log-Structured File
                  Systems with Adaptive Methods},
  booktitle =	 {Sixteenth ACM Symposium on Operating Systems
                  Principles (SOSP '97)},
  OPTpages =	 {},
  year =	 {1997},
  url =          {http://www.cs.berkeley.edu/~neefe/papers/sosp97/sosp97.ps},
  annote =	 {Examines various optimizations and tuning issues for
                  an LFS, and approaches to tune the file system
                  automatically to the workload (self-tuning). The
                  results are based on simulations of an LFS and a
                  disk; the benchmarks used are the Berkeley Auspex
                  trace, and a synthetic random update
                  benchmark. Write cost for the Auspex trace and 85\%
                  disk utilization is minimal at a segment size of
                  about 4$\times$access~time$\times$bandwidth; for the
                  random workload, write cost is minimized at
                  single-block segments. Hole-plugging (with metadata
                  in a block header) is never better than cleaning for
                  the Auspex workload, and better than cleaning at
                  $>85\%$ utilization (later on disks with more
                  bandwidth$\times$access time). Even at 99\%
                  utilization, the write cost of the Auspex workload
                  is only 3. An adaptive version that switches between
                  hole-plugging and segment cleaning performed about
                  as well as the best of hole-plugging and segment
                  cleaning for the respective utilization for the
                  random workload; for the Auspex workload it
                  performed slightly better than either beyond 98\%
                  utilization. Another optimization the authors
                  propose is preferring segments that are still cached
                  (from writing) when cleaning, saving read cost; they
                  report some nice speedups for large cache sizes and
                  high utilization (write cost reduction from 2.4 to
                  1.9 with 1GB cache and 95\% utilization); however,
                  the evaluation seems to compare against a crippled
                  version of the original heuristic that does not
                  profit from the cache at all (i.e., no lucky hits in
                  the cache), so I think it does not say much about
                  the value of the new heuristic, but mainly about the
                  value of caching. The paper also proposes optimizing
                  read performance by recording frequent sequences of
                  block reads and copying them together, with
                  promising results; the average per-block read
                  response times seem large, however (6.5ms--12ms);
                  are the files in the benchmark so small? The paper
                  gives many references.}
}

@InProceedings{chutani+92,
  author =	 {Sailesh Chutani and Owen T. Anderson and Michael
                  L. Kazar and Bruce W. Leverett and W. Anthony Mason
                  and Robert N. Sidebotham},
  title =	 {The {Episode} File System},
  booktitle =	 {Usenix Conference},
  pages =	 {43--60},
  year =	 {1992},
  month =	 {Winter},
  annote =	 {Episode is the local file system of the AFS 4 (and
                  DCE DFS) distributed file system. Episode is a
                  journaling file system that logs only metadata
                  updates; it logs both the new and the old value of
                  the data being modified, reducing the constraints on
                  the order of writes (since changes can be rolled
                  back); the paper discusses the issues of using and
                  implementing transactions internal to the file
                  system in some detail. Episode supports multiple
                  filesets per volume. It also supports read-only
                  clones of filesets, using block-level copy-on-write
                  in the implementation; during cloning a fileset all
                  inodes are copied, and the COW bit set in every
                  block pointer; when the file system tries to write
                  to a block that has the COW bit set, a new block is
                  allocated and the data is written there (and the
                  pointer to it has a cleared COW bit). The paper gives
                  empirical data comparing Episode to both BSD FFS and
                  JFS, where Episode does quite well (except for CPU
                  time, which the paper explains with missing
                  tuning). The paper also gives results on recovery
                  times, showing that the recovery time varied highly,
                  but correlated with the number of active processes
                  at the time of the crash.}
}

@InProceedings{roome92,
  author    = {W. D. Roome},
  title     = {{3DFS}: A Time-Oriented File Server},
  booktitle = {Usenix Conference},
  year      = {1992},
  month     = {Winter},
  pages     = {405--418},
  OPTannote = {Describes the interface and the implementation of an NFS
    server that allows access to old versions of the files (the changed
    files are typically recorded at the granularity of a day). The server
    uses a WORM jukebox together with a magnetical indexing and cache disk;
    it can recover from failure of the magnetical disk.}
}

@InProceedings{hitz+94,
  author    = {Dave Hitz and James Lau and Michael Malcolm},
  title     = {File System Design for an {NFS} File Server Appliance},
  booktitle = {Usenix Conference},
  year      = {1994},
  month     = {Winter},
  OPTpages  = {},
  url       = {http://www.netapp.com/library/tr/3002.pdf},
  annote    = {If you want to learn about log-structured file systems,
    start by reading this paper! It presents the on-disk structure of a
    no-update-in-place file system (WAFL) in a clear way. The difference to
    the usual log-structured file systems is that WAFL manages the free
    space with a block map instead of using segments and a cleaner. Another
    difference is that WAFL can deal with several snapshots (the block map
    contains 32 bits per block, with one bit for every snapshot). The file
    system is for a dedicated NFS server. This server contains NVRAM,
    allowing quick acknowledgements of writes and allows writing to the
    disk in large batches (the on-disk structure would result in a large
    proportion of meta-data writes if data had to be written in small
    batches). The NVRAM stores the write requests as they come in, not the
    resulting disk blocks; this maximizes the number of requests that can
    be stored in the NVRAM; on recovery the requests are simply replayed.}
}

@InProceedings{ruemmler&wilkes93,
  author    = {Chris Ruemmler and John Wilkes},
  title     = {{UNIX} Disk Access Patterns},
  booktitle = {Usenix Conference},
  year      = {1993},
  month     = {Winter},
  pages     = {405--420},
  url       = {http://www.hpl.hp.com/personal/John_Wilkes/papers/USENIX.Jan93.ps.Z},
  annote    = {Presents performance data at the disk driver level about
    traces on several machines under HP/UX. Most accesses were through a
    BSD FFS file system, but they also recorded swapping and paging. One
    interesting result was that increasing the cache hardly reduced read
    accesses.}
}

@InProceedings{seltzer+93,
  author =	 {Margo Seltzer and Keith Bostic and Marshall Kirk
                  McKusick and Carl Staelin},
  title =	 {An Implementation of a Log-Structured File System
                  for {UNIX}},
  booktitle =	 {Usenix Conference},
  pages =	 {307--326},
  year =	 {1993},
  month =	 {Winter},
  critique-url = {http://www.scriptics.com/people/john.ousterhout/seltzer93.html},
  annote =	 {This paper describes the BSD LFS; it gives more
                  details about the on-disk structure of an LFS than
                  \cite{rosenblum&ousterhout92} and should therefore
                  be read first (or better, read \cite{hitz+94}
                  first). The paper also presents some differences
                  from and improvements over Sprite LFS: less memory
                  consumption, dealing with almost-full disks,
                  user-level cleaning, fewer on-disk structures
                  outside files, dealing with directory operations by
                  having non-committing segment summaries instead of
                  introducing special log entries; one disadvantage
                  over Sprite LFS is that BSD LFS does not use the
                  information in segment summaries to roll forward,
                  and therefore needs to write out all dirty metadata
                  on every checkpoint. The paper also presents some
                  performance measurements, comparing BSD LFS with BSD
                  FFS and an improved version of FFS.}
}

@Article{ousterhout&douglis89,
  author  = {John Ousterhout and Fred Douglis},
  title   = {Beating the {I/O} Bottleneck: A Case for Log-Structured File
    Systems},
  journal = {Operating Systems Review},
  year    = {1989},
  month   = jan,
  volume  = {23},
  number  = {1},
  pages   = {11--28},
  annote  = {Predicts that I/O (in particular, disk seeks during file
    access) would become a bottleneck, because the CPUs are becoming faster
    faster than disk seeks; they were certainly right about the speedups,
    but the file access bottleneck has not happened in my environment yet
    (1999); we still have systems with 0.08 disks per concurrent user on a
    1200~MIPS machine doing software development, whereas the paper
    predicts requiring 20--80 disks per user on a 500~MIPS machine. The
    paper discusses a number of solutions for these problems: large file
    caches for eliminating most reads (this happens on our system),
    write-back caches with battery backup and alternatively, cache logging,
    and log-structured file systems.  The paper presents log-structured
    file systems in some depth.}
}

@InProceedings{douglis&ousterhout89,
  author    = {Fred Douglis and John Ousterhout},
  title     = {Log-Structured File Systems},
  booktitle = {{IEEE COMPCON}},
  year      = {1989},
  pages     = {124--129},
  annote    = {A shorter version of \cite{ousterhout&douglis89}. I recommend
    reading the longer version.}
}

@TechReport{dejonge+93,
  author =	 {de Jonge, Wiebren and M. Frans Kaashoek and Wilson
                  C. Hsieh},
  title =	 {Logical Disk: A Simple New Approach to Improving
                  File System Performance},
  institution =	 {MIT},
  number =	 {LCS/TR-566},
  institution2 = {Vrije Universiteit Amsterdam},
  number2 =	 {IR-325},
  year =	 {1993},
  note =	 {A paper on the same topic appeared at SOSP '93},
  url =
                  {ftp://ftp-pubs.lcs.mit.edu/pub/lcs-pubs/tr.outbox/MIT-LCS-TR-566.ps.gz},
  annote =	 {The logical disk provides an interface slightly
                  above the device driver interface that provides the
                  performance advantages of log-structured file
                  systems with few changes to the file
                  system. Disadvantages: high RAM consumption for the
                  logical-to-physical map, long recovery times (every
                  segment summary has to be read to recover the map,
                  then the fsck has to be performed), none of the
                  functionality advantages of an LFS. The paper
                  reports performance for an adapted version of the
                  Minix FS.}
}

@InProceedings{lee+99,
  author       = {Yui-Wah Lee and Kwong-Sak Leung and Mahadev Satyanarayanan},
  title        = {Operation-based Update Propagation in a Mobile File
    System},
  crossref     = {usenix99},
  pages        = {43--56},
  url          = {http://www.cs.cmu.edu/afs/cs/project/coda/Web/docdir/hcimd98.pdf},
  abstract-url = {http://www.cs.cmu.edu/afs/cs/project/coda/Web/absdir/s15-abstract.html},
  annote       = {The amount of data transferred for keeping a mobile and a
    server file system in sync is reduced by transferring operations that
    change files (e.g., invocations of the compiler), running them on the
    remote system, comparing the results (with checksums), and only
    transferring the full file if the results differ.}
}

@InProceedings{zadok+99,
  author   = {Erez Zadok and Ion Badulescu and Alex Shender},
  title    = {Extending File Systems Using Stackable Templates},
  crossref = {usenix99},
  pages    = {57--70},
  html-url = {http://www.cs.columbia.edu/~ezk/research/wrapfs/},
  ps-url   = {http://www.cs.columbia.edu/~ezk/research/wrapfs/wrapfs.ps},
  annote   = {Describes the design and implementation of Wrapfs, which
    provides a relatively simple interface for adding functionality (e.g.,
    encryption) to file systems.}
}

@InProceedings{shriver+99,
  author   = {Elizabeth Shriver and Christopher Small and Keith A. Smith},
  title    = {Why Does File System Prefetching Work?},
  crossref = {usenix99},
  pages    = {71--84},
  url      = {http://www.bell-labs.com/user/shriver/postscript/prefetching-usenix99.ps}
}

@InProceedings{brecht&sandhu99,
  author =	 {Tim Brecht and Harjinder Sandhu},
  title =	 {The Region Trap Library: Handling Traps on
                  Application-Defined Regions of Memory},
  crossref =	 {usenix99},
  pages =	 {85--99},
  url =
                  {http://bbcr.uwaterloo.ca/~brecht/papers/postscript/usenix99.ps},
  html-url =
                  {http://bbcr.uwaterloo.ca/~brecht/papers/html/usenix99/paper.html},
  annote =	 {This library allows the user to get MMU traps for
                  arbitrarily-sized regions (not just page-aligned
                  regions) and to change the protection level of
                  regions; it works by mapping (the current version
                  uses copying, but mapping should be a simple
                  optimization) all the regions several times into
                  memory with different protection levels, and
                  swizzling pointers to a specific region with a
                  specific protection to point to the copy with the
                  appropriate protection. This requires that pointers
                  to such regions be declared to the library, but the
                  resulting interface looks decent enough.}
}

@InProceedings{cranor&parulkar99,
  author   = {Charles D. Cranor and Gurudatta M. Parulkar},
  title    = {The {UVM} Virtual Memory System},
  crossref = {usenix99},
  pages    = {117--130},
  annote   = {Presents the UVM system for NetBSD and an empirical comparison
    with the old BSD VM system.}
}

@InProceedings{miller&myers99,
  author   = {Robert C. Miller and Brad C. Myers},
  title    = {Lightweight Structured Text Processing},
  crossref = {usenix99},
  pages    = {131--144},
  url      = {http://www.cs.cmu.edu/~rcm/papers/usenix99/usenix99.html},
  annote   = {Among other things, this paper contains some algebra-type
    stuff about dealing with text regions and text constraints.}
}

@InProceedings{banga+99,
  author   = {Gaurav Banga and Jeffrey C. Mogul and Peter Druschel},
  title    = {A Scalable and Explicit Event Delivery Mechanism for {UNIX}},
  crossref = {usenix99},
  pages    = {253--265},
  url      = {http://www.cs.rice.edu/~druschel/usenix99event.ps.gz},
  annote   = {Discusses the scaling problems of the select() and poll()
    interfaces for learning about events, proposes a new, better scalable,
    but also more complicated interface, and evaluates (an implementation
    of) this interface.}
}

@InProceedings{deller&heiser99,
  author =	 {Luke Deller and Gernot Heiser},
  title =	 {Linking Programs in a Single Address Space},
  crossref =	 {usenix99},
  pages =	 {283--294},
  html-url =	 {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/index.html},
  url =		 {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/paper.ps.gz},
  abstract-url = {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/abstract},
  annote =	 {Discusses the issues involved in static and dynamic
                  linking in a normal and a single-address-space OS,
                  and gives some performance data. Recommended reading
                  if you want to learn about linking.}
}

@InProceedings{nightingale+99,
  author   = {Tycho Nightingale and Yiming Hu and Qing Yang},
  title    = {The Design and Implementation of a {DCD} Device Driver for
    {Unix}},
  crossref = {usenix99},
  pages    = {295--307},
  url      = {ftp://ftp.ele.uri.edu/pub/tycho/USENIX99.ps.gz},
  annote   = {DCD (Disk Caching Disk) adds journaling at the device driver
    level (similar to the logical disk \cite{dejonge+93} which adds
    log-structure at the device driver level). The advantage of this
    approach is that the file system does not need to be changed, the
    disadvantage is that we do not get some of the advantages of journaling
    (e.g., fast crash recovery); the advantage that the paper emphasizes is
    performance (compared to BSD FFS).}
}

@InProceedings{anderson&griffioen99,
  author   = {Todd A. Anderson and James Griffioen},
  title    = {An Application-Aware Data Storage Model},
  crossref = {usenix99},
  pages    = {309--322},
  annote   = {Proposes that the application should specify how much
    persistence is wanted for a file; the OS (or, in the paper, a
    distributed file system) can then ensure that the valuable files are
    transferred soon to several servers while the easily regenerated files
    are typically just held locally on the clients (possibly just in RAM),
    with possible automated help in reconstructing the file.}
}

@Proceedings{usenix99,
  key       = {Usenix '99},
  title     = {Usenix Annual Technical Conference},
  booktitle = {Usenix Annual Technical Conference},
  year      = {1999}
}

@Book{corporaal98,
  author = 	 {Henk Corporaal},
  title = 	 {Microprocessor Architectures -- from {VLIW} to {TTA}},
  publisher = 	 {John Wiley \& Sons},
  year = 	 {1998},
  annote =	 {Transport-triggered architectures make the buses
                  used to transfer data between functional units and
                  register files architecturally visible and
                  programmable.  This book gives a comprehensive
                  discussion of TTAs, motivating them, discussing
                  various design issues and alternatives, giving
                  various theoretical and empirical results, and other
                  stuff (e.g., automatic generation of application
                  specific processors by optimization from a
                  template).  Compilation techniques are not covered
                  in depth, so you will have to look up the original
                  papers.  Recommended.}
}

@Book{higham98,
  author    = {Nicholas J. Higham},
  title     = {Handbook of Writing for the Mathematical Sciences},
  publisher = {Society for Industrial and Applied Mathematics (SIAM)},
  address   = {Philadelphia},
  year      = {1998},
  annote    = {A nice book teaching many aspects of scientific writing, from
    basic style issues to writing, publishing, and presenting a paper.
    There is some discussion specific to mathematical writing, but most of
    the book is more general in scope; many of the examples have a maths or
    CS background, though.}
}

@PhdThesis{brandis95thesis,
  author   = {Marc M. Brandis},
  title    = {Optimizing Compilers for Structured Programming Languages},
  school   = {Institute for Computer Systems, ETH Zurich},
  type     = {{PhD} Dissertation},
  number   = {11024},
  year     = {1995},
  url      = {ftp://ftp.inf.ethz.ch/pub/publications/dissertations/th11024.ps.gz},
  abstract = {Modern processor architectures rely on optimizing
    compilers to achieve high performance. Such
    architectures expose details of their hardware to the
    compiler, which has to deal with them in generating
    machine code. This development has led to complex and
    slow compilers, which are difficult to understand,
    implement, and maintain. This thesis reports on methods
    to simultaneously reduce the complexity and the
    compile-time of optimizing compilers by more than a
    decimal order of magnitude. It presents a novel
    intermediate program representation, which integrates
    data- and control-flow into a single data-structure.
    This provides not just for simpler and faster
    optimization algorithms, but also for more powerful
    optimization techniques. The thesis also describes
    single-pass algorithms to construct this intermediate
    program representation from structured source code, as
    well as single-pass techniques to transform programs
    with restricted kinds of unstructured control-flow like
    in Oberon into structured form. The integration of
    these techniques with the parser allows to implement
    fast and compact front-ends for structured programming
    languages, that avoid the many auxiliary data
    structures other optimizing compilers require. A
    description of several optimization algorithms and how
    they can be implemented on this intermediate program
    representation shows the feasibility of the approach.
    Most of these techniques have been implemented in a
    prototypical optimizing compiler translating a subset
    of the programming language Oberon for the PowerPC
    architecture. Measurements on this compiler prove that
    both the complexity and the compile-time of optimizing
    compilers can be reduced by an order of magnitude when
    translating a structured programming language and when
    using this novel intermediate representation and the
    associated algorithms. The thesis concludes with some
    feedback to the programming language designers, which
    language constructs cause undue complications in
    optimizing compilers and should therefore be
    omitted.},
  annote   = {Describes an optimizing Oberon compiler using SSA as
    intermediate representation.  This can be even used
    as a case-study style textbook on advanced compiler
    construction (at least it impressed me more in this
    respect than other compiler textbooks I have read
    recently).  One other important aspect in this regard
    is that the compiler is actually implemented and
    mostly by one person.  Highly recommended.}
}

@Book{appel98,
  author = 	 {Andrew W. Appel},
  title = 	 {Modern Compiler Implementation in {C}},
  publisher = 	 {Cambridge University Press},
  year = 	 {1998},
  annote =	 {The C version of the Tiger book.  The first half
                  presents the design for a complete compiler for
                  Tiger (a Pascal-like language) and, of course, the
                  techniques used in that design  (and some
                  surrounding material); the compiler itself is an
                  exercise. Then the book discusses handling various
                  language variations (e.g., OO and functional
                  programming), and various optimization techniques.}
}

@InProceedings{chung+00,
  author =	 {Yoo C. Chung and Soo-Mook Moon and Kemal
                  Ebcio{\u{g}}lu and Dan Sahlin},
  title =	 {Reducing Sweep Time for a Nearly Empty Heap},
  booktitle =	 {Symposium on Principles of Programming Languages
                  (POPL'00)},
  pages =	 {378--389},
  year =	 {2000},
  annote =	 {Proposes \emph{selective sweeping}, where the marked
                  objects are sorted by address, then just putting all
                  the gaps into the freelist. This requires asymptotic
                  complexity proportional to the number of live
                  objects. For some benchmarks, this method is slower
                  for smaller heap sizes than the other sweeping
                  method they are using; they propose \emph{adaptive
                  sweeping} that switches between the methods
                  depending on the fraction of live memory. Comments:
                  The sorting done explicitly in selective sweeping is
                  implicit in mark-and-sweep collectors using external
                  bitmaps; however, there clearing the bitmap before
                  each collection takes time proportional to heap
                  size; a more sophisticated data structure than a
                  bitmap could eliminate this. More fundamentally,
                  using a heap that is much larger than the live
                  memory is bad behaviour in a multi-tasking system,
                  so a garbage collector should try to keep the heap
                  size proportional to live memory anyway, and
                  therefore the live-size vs. heap-size complexity
                  argument is irrelevant. And even if you allow huge
                  heap sizes, the argument becomes irrelevant as soon
                  as you consider the combination of allocation and GC
                  cost (you cannot GC memory that you have not
                  allocated).}
}

@inproceedings{bala+00,
  author    = {Vasanth Bala and Evelyn Duesterwald and Sanjeev Banerjia},
  title     = {Dynamo: A Transparent Dynamic Optimization System},
  pages     = {1--12},
  crossref  = {sigplan00},
  annote    = {Dynamo starts by emulating binaries while profiling
               back-edges. When it decides that a loop header
               instruction is hot, it generates a fragment (a
               superblock that can cross call/return boundaries)
               based on the specific trace executed that time,
               stores that in the fragment cache, and uses it in
               the future (until the fragment cache is flushed). It
               also performs a few optimizations; however, the
               results indicate that the main performance benefit
               (on the PA-8000) comes from fragment formation
               alone, i.e., from the partial inlining and the code
               layout. The paper presents data taken mostly from
               the SPEC benchmarks and Dynamo achieves speedups of
               -2\%--22\% over HP's compiler with -O2. The
               discussion also indicates that Dynamo loses on
               programs with run-times $<1$min and on programs
               without stable working sets. The first problem could
               be worked around by using such a technique only on
               binaries that have run for a while.},
}

@inproceedings{wilken+00,
  author    = {Kent Wilken and Jack Liu and Mark Heffernan},
  title     = {Optimal Instruction Scheduling Using Integer Programming},
  pages     = {121--133},
  crossref  = {sigplan00},
  annote    = {This paper improves the integer programming methods
               used for optimal instruction scheduling in several
               ways, finally getting all the basic blocks from
               SPECfp95 for a single-issue machine with latencies
               of 1-3 cycles (with up to about 1000 instructions)
               to schedule in decent time. The improvements are:
               DAG transformations like partitioning, eliminating
               redundant edges, linearizing some regions (the
               presented method works only for single-issue
               machines, though); some consistency conditions that
               should also work for CLP (and maybe dynamic
               programming) formulations of the problem; and some
               techniques specific to integer programming (but
               maybe only attacking the problems inherent in that
               technique). With all these improvements, they
               scheduled a block with 1000 instructions, while
               earlier optimal techniques were limited to about 30
               instructions. It remains to be seen, however, how
               well it does on more complex machine models. It
               would also be interesting to see how CLP and dynamic
               programming would benefit from the improvements
               presented in this paper (at least those that are
               applicable).},
}

@inproceedings{yi+00,
  author    = {Qing Yi and Vikram Adve and Ken Kennedy},
  title     = {Transforming Loops to Recursion for Multi-Level Memory
               Hierarchies},
  pages     = {169--181},
  crossref  = {sigplan00},
  annote    = {Recursive versions of various array processing
               algorithms (e.g., dividing matrix multiplication
               into eight multiplies and four additions of
               quarter-sized matrices) have nice memory performance
               characteristics, similar to cache blocking at all
               cache levels, but they need not be tuned for cache
               sizes. This paper presents an algorithm that
               automatically transforms a loop-based program into a
               recursion-based program, and some impressive
               performance data for the results.},
}

@inproceedings{cannarozzi+00,
  author    = {Dante J. Cannarozzi and Michael P. Plezbert and
               Ron K. Cytron},
  title     = {Contaminated Garbage Collection},
  pages     = {264--273},
  crossref  = {sigplan00},
  annote    = {Presents a memory reclamation technique that
               associates a stack frame with each object; it
               guarantees that the object is dead when the stack
               frame dies. The frame references in the objects may
               have to be updated when references change. The
               method is quite inaccurate (it may keep many dead
               objects around), and therefore has to be
               supplemented by a more accurate method (a more
               conventional garbage collector); the supposed
               benefit is that the conventional GC has to be
               invoked fewer times, which may result in an overall
               speedup if the conventional GC is slower in
               reclaiming memory than the new approach. The paper
               presents some data involving JDK 1.1.8 that support
               this claim, but I am not convinced that this can be
               generalized (how good is the JDK 1.1.8 GC?).},
}

@inproceedings{boothe00,
  author    = {Bob Boothe},
  title     = {Efficient Algorithms for Bidirectional Debugging},
  pages     = {299--310},
  crossref  = {sigplan00},
  annote    = {Describes a debugger that can step backwards in the
               program. A program compiled for this debugger
               contains a call to a counter routine for every
               statement. The slowdown from this is about a factor
               of 2. If the program wants to step back one
               statement, it is simply reexecuted until the counter
               is one less than the current value. The paper also
               describes stepping back by breakpoints, stepping back
               over calls, stepping back to the start of the
               current procedure, and backwards watchpoints. To
               speed up reexecution, the debugger creates
               checkpoints (with fork). One complication in
               reexecution is (I/O) system calls; the debugger does
               not reexecute them, but returns the same results
               that they returned upon first execution.},
}

@proceedings{sigplan00,
  key       = {PLDI '00},
  title     = {SIGPLAN '00 Conference on Programming Language Design
               and Implementation},
  booktitle = {SIGPLAN '00 Conference on Programming Language Design
               and Implementation},
  year      = {2000},
}

@InProceedings{oskin+00,
  author =	 {Mark Oskin and Frederic T. Chong and Matthew
                  Farrens},
  title =	 {{HLS}: Combining Statistical and Symbolic Simulation
                  to Guide Microprocessor Designs},
  crossref =	 {isca00},
  pages =	 {71--82},
  annote =	 {They characterize a workload by the usual
                  instruction class usage frequencies
                  (dhrystone-like), cache and branch prediction hit
                  rates (for a specific configuration), and (new) by
                  the dynamic instruction distances (DID), i.e., the
                  number of instructions that a dependence spans. The
                  statistical simulator then generates a random
                  instruction stream with the same characteristics,
                  and executes this on a cycle-accurate simulator,
                  which converges after only a few thousand simulated
                  cycles. For the SPECint95 benchmarks on the
                  SimpleScalar architecture they get IPC values within
                  7\% of the original SimpleScalar values; lower
                  branch prediction and cache accuracies also lower
                  the accuracy of the statistical simulator. The DID
                  characteristic seems to perform surprisingly
                  well. The paper then goes on to demonstrate how the
                  technique could be used for exploring design
                  tradeoffs, but without validation from a full
                  simulator I don't trust the results. The paper also
                  discusses the limits of their technique.}
}

@Proceedings{isca00,
  title = 	 "$27^{\textit{th}}$ Annual International Symposium on Computer Architecture",
  booktitle = 	 "$27^{\textit{th}}$ Annual International Symposium on Computer Architecture",
  year = 	 "2000",
  key =		 "ISCA 27",
}

@article{bailey91,
  author    = {David H. Bailey},
  title     = {Twelve Ways to Fool the Masses When Giving Performance
               Results on Parallel Computers},
  journal   = {Supercomputing Review},
  pages     = {54--55},
  month     = aug,
  year      = {1991},
  url       = {http://www.pdc.kth.se/training/twelve-ways.html},
  annote    = {Most of these ways are somewhat specific to
               supercomputing, but you can probably learn lessons
               applicable to other fields.},
}

@article{zelkowitz&wallace98,
  author    = {Marvin V. Zelkowitz and Dolores R. Wallace},
  title     = {Experimental Models for Validating Technology},
  journal   = ieeecomputer,
  volume    = {31},
  number    = {5},
  pages     = {23--31},
  month     = may,
  year      = {1998},
  annote    = {Presents a classification of software engineering
               validation models and discusses them. They also
               classified 612 papers in software engineering; the
               most popular methods are no experimentation (167
               papers) and Assertion (ad-hoc validation techniques
               for the proposed technology, with danger of bias;
               192 papers).},
}

@article{tichy98,
  author    = {Walter F. Tichy},
  title     = {Should Computer Scientists Experiment More?},
  journal   = ieeecomputer,
  volume    = {31},
  number    = {5},
  pages     = {32--40},
  month     = may,
  year      = {1998},
  url       = {http://wwwipd.ira.uka.de/~tichy/},
  annote    = {Discusses the role experimentation should play in
               computer science, and why some of the excuses given
               for not doing it are invalid.},
}

@Article{tichy+95,
  author =	 {Walter F. Tichy and Paul Lukowicz and Lutz Prechelt
                  and Ernst A. Heinz},
  title =	 {Experimental Evaluation in Computer Science: A
                  Quantitative Study},
  journal =	 {Journal of Systems and Software},
  year =	 {1995},
  volume =	 {28},
  number =	 {1},
  pages =	 {9--18},
  month =	 jan,
  url =
                  {http://wwwipd.ira.uka.de/~prechelt/Biblio/1994-17.ps.gz},
  annote =	 {Classifies the papers in several journals and
                  conferences (non-CS: Optical Engineering, Neural
                  Computation; CS: TOCS, PLDI, TOPLAS, TSE, and a
                  random sample of ACM papers) into Theory, Design,
                  Empirical, Hypothesis, and Other papers; for the
                  Design papers it classifies the amount of space
                  dedicated to the empirical validation. They observe
                  that the CS papers have a higher percentage of
                  design articles without empirical validation than
                  the non-CS papers (35\%--55\% vs. $<15$\%). They do
                  some error analysis, some of which is not convincing
                  (e.g., they use a confidence interval of 70\%
                  without justification). The paper is very good on
                  reproducibility: it presents the papers used and the
                  classification used in the appendix.}
}

@InProceedings{johnson00,
  author =	 {David S. Johnson},
  title =	 {A Theoretician's Guide to the Experimental Analysis
                  of Algorithms},
  booktitle =	 {Dagstuhl Seminar on Experimental Algorithmics},
  year =	 {2000},
  month =	 sep,
  note =	 {An earlier version of \cite{johnson02}},
  annote =	 {Gives much useful advice on producing empirical
                  papers, most of it applicable beyond algorithm work,
                  including lists of pitfalls, suggestions, and pet
                  peeves. One important principle discussed is
                  comparability (as differentiated from
                  reproducibility).}
}

@InProceedings{johnson02,
  author =	 {David S. Johnson},
  title =	 {A Theoretician's Guide to the Experimental Analysis
                  of Algorithms},
  booktitle =	 {Proceedings of the 5th and 6th DIMACS Implementation Challenges},
  year =	 {2002},
  url =		 {http://www.research.att.com/~dsj/papers/experguide.ps},
  annote =	 {Gives much useful advice on producing empirical
                  papers, most of it applicable beyond algorithm work,
                  including lists of pitfalls, suggestions, and pet
                  peeves. One important principle discussed is
                  comparability (as differentiated from
                  reproducibility).}
}

@Article{sima00,
  author =	 {Dezs{\H{o}} Sima},
  title =	 {The Design Space of Register Renaming Techniques},
  journal =	 ieeemicro,
  year =	 {2000},
  volume =	 {20},
  number =	 {5},
  pages =	 {70--83},
  month =	 sep,
  annote =	 {Describes various design choices in hardware
                  register renaming and lists the choices
                  taken in many existing CPUs.}
}

@article{hookway&herdeg97,
  author    = {Raymond J. Hookway and Mark A. Herdeg},
  title     = {{DIGITAL FX!32}: Combining Emulation and Binary Translation},
  journal   = {Digital Technical Journal},
  volume    = {9},
  number    = {1},
  pages     = {3--12},
  year      = {1997},
  url       = {http://research.compaq.com/wrl/DECarchives/DTJ/DTJP01/DTJP01P8.PS},
  annote    = {Among other things, this paper describes software
               pipelining the emulator loop; it also uses the first
               two bytes of each instruction as an index into the
               dispatch table (no word on D-cache miss rate, though).},
}

@article{wulf&mckee95,
  author    = {Wm. A. Wulf and Sally A. McKee},
  title     = {Hitting the Memory Wall: Implications of the Obvious},
  journal   = can,
  volume    = {23},
  number    = {1},
  pages     = {20--24},
  month     = mar,
  year      = {1995},
  url       = {ftp://ftp.cs.virginia.edu/pub/techreports/CS-94-48.ps.Z},
  annote    = {This paper is mentioned frequently, probably
               because it introduced the term \emph{memory
               wall}. However, the central argument of the paper is
               flawed. You can find a longer critique by me at
               http://www.complang.tuwien.ac.at/anton/memory-wall.html.},
}

@PhdThesis{piumarta92,
  author =	 {Ian K. Piumarta},
  title =	 {Delayed Code Generation in a {Smalltalk-80}
                  Compiler},
  school =	 {University of Manchester},
  year =	 {1992},
  url =          {http://www.wolczko.com/mushroom/theses/piumarta.ps.gz},
  annote =	 {The main topic seems to be to replace peephole
                  optimization (and its compile-time cost) with
                  delayed code generation to achieve similar code
                  quality. When compiling a node, instead of
                  generating code for moving the argument described by
                  a node to a register, delayed code generation just
                  creates a descriptor and leaves the optimal code
                  generation to the user of the node. This was not
                  original at the time; e.g., tree parsing code
                  generation (BEG/burg) does this, and even does it in
                  an optimal way.}
}

@inproceedings{hoffmann&o'donnell79,
  author    = {Christoph M. Hoffmann and Michael J. O'Donnell},
  title     = {An Interpreter Generator Using Tree Pattern Matching},
  booktitle = {Principles of Programming Languages (POPL'79)},
  year      = {1979},
  pages     = {169--179},
  annote    = {This paper sketches many ideas having to do with the
               equational specification of an interpreter of a
               functional language and the generation of a
               tree-parsing interpreter from the specification.},
}

@inproceedings{card+94,
  author    = {R\'emy Card and Theodore Ts'o and Stephen Tweedie},
  title     = {Design and Implementation of the Second Extended
               Filesystem},
  booktitle = {Proceedings of the First Dutch International Symposium
               on Linux},
  year      = {1994},
  isbn      = {90-367-0385-9},
  url       = {http://web.mit.edu/tytso/www/linux/ext2intro.html},
  OPTpages  = {},
  OPTannote = {},
}

@book{bell+90,
  author    = {Timothy C. Bell and John G. Cleary and Ian H. Witten},
  title     = {Text Compression},
  publisher = {Prentice-Hall},
  year      = {1990},
  isbn      = {0-13-911991-4},
  annote    = {A very nice textbook on code compression covering
               both theory and empirical work, as well as
               discussing a lot of background topics. Some parts
               take quite some work to comprehend, though.},
}

@manual{intel01,
  title        = {Intel Pentium~4 Processor Optimization},
  organization = {Intel},
  year         = {2001},
  OPTnote      = {},
  OPTannote    = {},
}

@InProceedings{hughes82,
  author =       "R. J. M. Hughes",
  title =        "Super-Combinators",
  booktitle =    "Conference Record of the 1982 ACM Symposium on LISP
                 and Functional Programming, Pittsburgh, PA",
  pages =        "1--10",
  publisher =    "ACM",
  address =      "New York",
  year =         "1982",
  OPTannote =    {}
}

@Article{hinton+01,
  author =	 {Glenn Hinton and Dave Sager and Mike Upton and
                  Darrel Boggs and Doug Carmean and Alan Kyker and
                  Patrice Roussel},
  title =	 {The Microarchitecture of the {Pentium~4} Processor},
  journal =	 {Intel Technology Journal},
  year =	 {2001},
  month =	 {Q1},
  url =		 {http://developer.intel.com/technology/itj/q12001/articles/art_2nav.htm},
  pdf-url =	 {http://developer.intel.com/technology/itj/q12001/pdf/art_2.pdf},
  OPTannote =	 {}
}

@techreport{klaiber00,
  author      = {Alexander Klaiber},
  title       = {The Technology Behind {Crusoe} Processors},
  institution = {Transmeta Corporation},
  year        = {2000},
  url         = {http://www.transmeta.com/pdf/white_papers/paper_aklaiber_19jan00.pdf},
  annote      = {Gives an overview of Transmeta's processors, in
                 particular code morphing and discusses hardware
                 support for dealing with various problems in code
                 morphing (precise exceptions, aliases,
                 self-modifying code), and power management. For
                 self-modifying code, it write-protects pages that
                 contain code that has been translated; the paper
                 also hints at more sophisticated strategies.},
}

@InProceedings{ghiya+01,
  author =	 {Rakesh Ghiya and Daniel Lavery and David Sehr},
  title =	 {On the Importance of Points-To Analysis and Other
                  Memory Disambiguation Methods For {C} Programs},
  crossref =	 {sigplan01},
  pages =	 {47--58},
  annote =	 {This paper evaluates the effect of various memory
                  disambiguation techniques on the run-time (of
                  SPECint 2000 compiled with the Itanium
                  compiler). Unfortunately they turned off generating
                  code using data speculation techniques (including
                  run-time disambiguation) in all experiments, so the
                  results show higher speedups for static
                  disambiguation techniques than usual on that
                  platform. The paper also gives metrics like average
                  points-to set size, percentages of
                  \emph{independent}, \emph{maybe}, and
                  \emph{dependent} answers to disambiguation queries,
                  and which methods provided the answers. This data
                  allows to see if the indirect metrics used in much
                  of the pointer analysis literature correlate with
                  run-time; these results are mixed, so using indirect
                  metrics alone is not a good idea. The overall
                  speedup from disambiguation techniques is between 2\%
                  (181.mcf) and 26\% (256.bzip2), average 12\%. The
                  intra- and interprocedural points-to analyses
                  provide little speedup over simpler techniques
                  (exception: interprocedural points-to analysis
                  provides about 6\% speedup for 300.twolf); just
                  analysing global variables for address-taken
                  (interprocedurally) gives significant speedups for
                  175.vpr, 176.gcc, 254.gap, 256.bzip2 and 300.twolf
                  and ``steals much of the thunder from
                  interprocedural points-to analysis''. One
                  interesting result is that the compiler can benefit
                  significantly from recognizing memory-allocation
                  routines (and conversely, performance can suffer if
                  the compiler does not recognize a user-defined
                  memory-allocation routine).}
}

@inproceedings{shaham+01,
  author    = {Ran Shaham and Elliot K. Kolodner and Mooly Sagiv},
  title     = {Heap Profiling for Space-Efficient {Java}},
  pages     = {104--113},
  crossref  = {sigplan01},
  annote    = {Measures, in a Java environment, the drag-time, the
               time between an object becoming dead (i.e., the last
               dynamic use) and the object becoming unreachable
               (and thus garbage-collectable), and how much it can
               be reduced in various automatic ways.},
}

@InProceedings{evans&fraser01,
  author =	 {William S. Evans and Christopher W. Fraser},
  title =	 {Bytecode Compression via Profiled Grammar Rewriting},
  crossref =	 {sigplan01},
  pages =	 {148--155},
  annote =	 {Introduces a kind of two-level interpretation
                  scheme, where the actual interpreted program code is
                  a linearized representation of a parse tree for a
                  grammar that can generate all the VM code for a
                  basic block that the compiler can generate; so the
                  higher level of the interpreter interprets the
                  derivation, while the lower level interprets the VM
                  instructions given in the grammar rules. The 256
                  possible bytecodes for selecting the rule in the
                  grammar are utilized by enhancing the grammar with
                  (otherwise redundant) rules that encode frequently
                  occurring sequences of VM code; these rules are
                  produced by inlining rules for non-terminals. This
                  scheme provides more compression potential than
                  ordinary superinstructions because the nonterminal
                  provides additional context, and because the grammar
                  rules can still contain non-terminals (providing the
                  best of both superinstructions and
                  superoperators). This method compresses
                  lcc-generated bytecode down to about 40\%, with an
                  increase of 11KB in interpreter size (most of it as
                  grammar data). The resulting code is executable and
                  does not require extra memory for decompression. The
                  complete executable is about two thirds the size of
                  an lcc-generated native-code executable, but about
                  the same size as a MSVC-optimized executable; it
                  remains to be seen how well code produced by an
                  optimizing compiler would compress.}
}

@InProceedings{amme+01,
  author =	 {Wolfram Amme and Niall Dalton and Jeffery von Ronne
                  and Michael Franz},
  title =	 {{SafeTSA}: A Type Safe and Referentially Secure
                  Mobile-Code Representation Based on Static Single
                  Assignment Form},
  crossref =	 {sigplan01},
  pages =	 {137--147},
  annote =	 {The basic ideas in this representation are:
                  variables are named as the pair (distance in the
                  dominator tree, assignment within basic block);
                  variables are separated by type, with operations
                  referring only to variables of the right type (like
                  integer and FP instructions and registers in
                  assemblers); memory references use types to encode
                  that a null-pointer check and/or a range check has
                  already occurred, allowing optimizing these
                  operations; the resulting code is encoded (using
                  text compression methods) in a way that supports
                  only correct code. These ideas are discussed mostly
                  in a general way, with some Java-specifics, but the
                  representation supposedly also supports Fortran95
                  and Ada95. The representation supports some CSE, but
                  not for address computation operations. The paper
                  also gives numbers on size (usually a little smaller
                  than Java bytecode), and some other static metrics,
                  especially wrt. the effect of optimizations.}
}

@InProceedings{appel&george01,
  author =	 {Andrew W. Appel and Lal George},
  title =	 {Optimal Spilling for {CISC} Machines with Few
                  Registers},
  crossref =	 {sigplan01},
  pages =	 {243--253},
  annote =	 {They divide the usual register allocation problem
                  into allocation and assignment. The allocation part
                  is solved optimally by modeling it as an integer
                  linear programming problem (cost function: cost of
                  spill and fill code and CISC memory instructions);
                  the assignment part is solved by inserting potential
                  parallel copies everywhere and then removing the
                  unnecessary ones by optimistic coalescing
                  \cite{park&moon98}; the parallel copies are
                  sequentialized without extra registers by using the
                  386 instruction xchg if necessary. The resulting
                  register allocation algorithm takes 30 times longer
                  than the base allocator they compare with, but
                  increases the execution speed of some benchmark
                  programs by 0\%--25.5\% on a Pentium~II. The paper
                  also shortly describes some approaches they tried
                  and that did not work out, and gives an insightful
                  comparison with previous work.}
}

@InProceedings{hanson&proebsting01,
  author = 	 {David R. Hanson and Todd A. Proebsting},
  title = 	 {Dynamic Variables},
  crossref =	 {sigplan01},
  pages =	 {264--273},
  annote =	 {}
}

@proceedings{sigplan01,
  key       = {PLDI '01},
  title     = {SIGPLAN '01 Conference on Programming Language Design
               and Implementation},
  booktitle = {SIGPLAN '01 Conference on Programming Language Design
               and Implementation},
  year      = {2001},
}

@TechReport{ding&zhong01,
  author =	 {Chen Ding and Yutao Zhong},
  title =	 {Reuse Distance Analysis},
  institution =	 {Computer Science Department, University of
                  Rochester},
  year =	 {2001},
  number =	 {UR-CS-TR-741},
  month =	 feb,
  url =
                  {http://www.cs.rochester.edu/u/cding/Documents/Publications/TR741.ps},
  annote =	 {Introduces the term ``reuse distance'' (number of
                  references to distinct other memory items since the
                  last use); presents an efficient algorithm for
                  computing reuse distances (based on the counting
                  method of Bennett and Kruskal, not the stack method
                  of Mattson et~al.); discusses instrumenting programs
                  at the source level to record reuse distances;
                  presents and discusses reuse distance histograms for
                  six Fortran~77 programs; shows the effect of
                  reuse-driven execution and reuse-based loop fusion
                  on the reuse distances; and presents timings for an
                  incomplete (wrt efficiency) implementation of the
                  algorithm.}
}

@Unpublished{vandrunen+01,
  author =	 {Thomas VanDrunen and Antony L. Hosking and Jens
                  Palsberg},
  title =	 {Reducing Loads and Stores in Stack Architectures},
  note =
                  {http://www.cs.purdue.edu/homes/palsberg/draft/vandrunen-hosking-palsberg00.ps.gz},
  year =	 {2001},
  annote =	 {Gives a nice overview of the previous work on stack
                  allocation etc. Then the paper presents a calculus
                  of transformations on straight-line, sequential code
                  (for a JVM-like stack machine). The manuscript of
                  September 30, 2001 does not yet explain what
                  problem it tries to solve, and does not have a
                  discussion of the result.}
}

@InProceedings{zibin&gil01,
  author =	 {Yoav Zibin and Joseph Gil},
  title =	 {Efficient Subtyping Tests with {PQ}-Encoding},
  crossref =	 {oopsla01},
  pages =	 {96--107},
  annote =	 {Gives a detailed overview of the type inclusion test
                  problem and the existing solutions. Then it extends
                  the relative numbering approach (where types are
                  represented as intervals of integer numbers) from
                  single inheritance to multiple inheritance. This is
                  not possible in general with a single encoding,
                  because it is not always possible to arrange the
                  type hierarchy in a way that all subtypes of each
                  type are adjacent. So the paper proposes slicing the
                  type graph into subgraphs for which the required
                  property holds, and having several encodings for the
                  types; this costs mainly a little space in each type
                  for holding the different encodings of the type, but
                  no additional instructions in the inclusion
                  test. The paper proposes a method for creating such
                  encodings by using PQ-trees, and evaluates it
                  empirically; it requires a little less space at
                  run-time (e.g. 16KB instead of 39KB for Eiffel4) and
                  a little more compile time than bit-packed encoding
                  (which is based on displays); no run-time or code
                  size numbers are presented. The paper is somewhat
                  hard to read, due to using hardly or not explained
                  terms, notation, and abbreviation, and too much
                  mathematical notation.}
}

@InProceedings{alpern+01,
  author   = {Alpern, Bowen and Cocchi, Anthony and Fink, Stephen
              and Grove, David},
  title    = {Efficient Implementation of Java Interfaces:
              Invokeinterface Considered Harmless},
  pages    = {108--124},
  crossref = {oopsla01},
  annote   = {Uses selector indexed tables for dispatching
              interface calls; the twist here is that it does not
              use the full table indexed by type and selector, nor
              selector colouring (which cannot be done
              incrementally) to reduce the size, but instead
              allows conflicts in the table and uses conflict
              resolution stubs to resolve them. The result is
              called interface method tables (IMT). Also discusses
              how to change some interface calls to virtual calls
              (virtualization) based on data-flow analysis, and to
              devirtualize and inline some of these calls, and
              doing some of that in a guarded way. The empirical
              evaluation shows a high frequency of virtual calls
              in benchmarks; many of them can be optimized into
              guarded virtual calls, either statically, or in a
              profile-guided way. Different realistic interface
              dispatch methods usually have less than 5\% impact,
              but reach 30\% performance variation (in both
              directions) in some benchmarks.}
}

@InProceedings{whaley01,
  author   = {John Whaley},
  title    = {Partial Method Compilation using Dynamic Profile
              Information},
  crossref = {oopsla01},
  pages    = {166--179},
  annote   = {Profiles at the basic-block level; upon exceeding a
              threshold, the method containing the basic block is
              partially (i.e., active blocks only) compiled; this
              saves compiling about half of the basic blocks,
              depending on the threshold settings. Among the
              optimizations performed are partial dead code
              elimination, and escape analysis (for allocating
              objects on the stack).}
}

@InProceedings{suganuma+01,
  author   = {Suganuma, Toshio and Yasue, Toshiaki and Kawahito,
              Motohiro and Komatsu, Hideaki and Nakatani, Toshio},
  title    = {A Dynamic Optimization Framework for a {Java}
              Just-In-Time Compiler},
  pages    = {180--194},
  crossref = {oopsla01},
  annote   = {Describes a JVM system that starts out by
              interpreting JVM code (with a threshold of 2000
              executions of a basic block), and then compiles the
              code per-method in three stages (quick optimization,
              full optimization, and special optimization (with
              specialization)). After interpretation, dynamically
              installed and deinstalled instrumentation is used to
              get profiles for further optimization.}
}

@InProceedings{pechtchanski&sarkar01,
  author   = {Pechtchanski, Igor and Sarkar, Vivek},
  title    = {Dynamic Optimistic Interprocedural Analysis},
  pages    = {195--210},
  crossref = {oopsla01},
  annote   = {They deal with some of the weaknesses of
              interprocedural analysis by using dynamic
              recompilation; this allows to use optimistic
              assumption and recompile when they become invalid
              (e.g., through dynamic class loading). The paper
              presents an analysis framework and demonstrates it
              with type analysis for devirtualization and inlining
              as example application. The results of this
              optimization are slightly better than pessimistic
              analysis for method calls, and sometimes a lot
              better for interface calls.}
}

@InProceedings{visser01,
  author   = {Visser, Joost},
  title    = {Visitor Combination and Traversal Control},
  pages    = {270--282},
  crossref = {oopsla01},
  annote   = {Shows how to build general and flexible tree
              traversals from a few simple combinators.}
}

@Proceedings{oopsla01,
  title     = {Conference on Object-Oriented Programming, Systems,
               Languages \& Applications (OOPSLA '01)},
  booktitle = {Conference on Object-Oriented Programming, Systems,
               Languages \& Applications (OOPSLA '01)},
  year      = {2001},
  key       = {OOPSLA '01},
}

@PhdThesis{sugumar93,
  author       = {Sugumar, Rabin A.},
  title        = {Multi-Configuration Simulation Algorithms for the
                  Evaluation of Computer Architecture Designs},
  school       = {University of Michigan},
  year         = {1993},
  note         = {Technical Report CSE-TR-173-93 with Santosh G. Abraham},
  url          = {http://www.eecs.umich.edu/PPP/rabins-thesis.ps},
  abstract-url = {http://www.eecs.umich.edu/PPP/rabins-thesis.html},
  annote       = {}
}

@InProceedings{naessen+01,
  author    = {Henrik N{\"a}ss{\'e}n and Mats Carlsson and
               Konstantinos Sagonas},
  title     = {Instruction Merging and Specialization in the
               {SICStus Prolog} Virtual Machine},
  booktitle = {Principles and Practice of Declarative Programming
               (PPDP01)},
  OPTpages  = {},
  year      = {2001},
  url       = {http://www.csd.uu.se/%7Ekostis/Papers/sicstus.ps.gz},
  annote    = {Gives an overview of various WAM optimization
               techniques and then evaluates combining (merging)
               pairs of instructions into (about 60)
               superinstructions, specializing WAM instructions for
               specific immediate arguments (in particular,
               specific registers, for about 200 new instructions),
               and a combination of both (for about 100 new
               instructions). Instruction merging produces small
               speedups (about 8\% on average), specialization
               produces a small slowdown on average, and both
               combined are about as fast as instruction merging
               alone. VM code size is reduced by around 10\% with
               these techniques, and the VM emulator size grows by
               up to 15KB.}
}

@InProceedings{noel+98,
  author    = {Fran{\c{c}}ois No{\"e}l and Luke Hornof and Charles
               Consel and Julia L. Lawall},
  title     = {Automatic, Template-Based Run-Time Specialization:
               Implementation and Experimental Study},
  booktitle = {IEEE International Conference on Computer Languages
               (ICCL '98)},
  pages     = {123--142},
  year      = {1998},
  url       = {http://compose.labri.fr/documentation/papers/rt_bench.ps.gz},
  annote    = {This paper first gives a nice overview of partial
               evaluation and specialization, then describes
               implementation details of the Tempo run-time
               specializer for C: at compile-time the specializer
               produces C code for functions containing the
               templates for the run-time generated code in an
               unoptimizable context similar to the actual usage of
               the templates (this ensures that the register
               allocation etc.\ will be such that the templates fit
               together). Template boundaries are registered using
               labels-as-values and function pointers. Holes for
               filling in constants or calls are created by
               referencing external variables, and using linkage
               information (through GNU BFD) to find the addresses
               of the holes. Finally, the paper shows some
               benchmark results; for several benchmarks run-time
               specialization gives 70\%--90\% of the speed of
               compile-time specialization, but for dot-product and
               dithering, where the main benefit of compile-time
               specialization comes from strength reduction,
               changes in the source code of the benchmark were
               required to achieve such results (these
               transformations can probably be automated with
               additional effort); the break-even point was quite
               soon for all benchmarks (3--87 iterations), mostly
               because the run-time specialization is quite quick
               (just copying templates and filling holes), and most
               of the work was already done at compile-time.}
}

@Book{adobe99,
  author    = {{Adobe Systems Incorporated}},
  title     = {PostScript Language --- Reference Manual},
  edition   = {third},
  publisher = {Addison-Wesley},
  year      = {1999},
  url       = {http://www.adobe.com/products/postscript/pdfs/PLRM.pdf}
}

@Book{conklin&rather97,
  author    = {Conklin, Edward K. and Rather, Elizabeth D.},
  title     = {Forth Programmer's Handbook},
  publisher = {Forth, Inc.},
  year      = {1997},
  isbn      = {0-9662156-0-5},
  OPTannote = {}
}

@InProceedings{shaw02,
  author    = {Mary Shaw},
  title     = {What Makes Good Research in Software Engineering?},
  booktitle = {Presented at the European Joint Conference of Theory
               and Practice of Software (ETAPS 2002), Grenoble,
               France. To appear in the International Journal on
               Software Tools for Technology Transfer.},
  year      = {2002},
  url       = {http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/compose/www/ftp/shaw-fin-etaps.pdf},
  OPTnote   = {},
  annote    = {Takes a look at various research strategies used in
               software engineering.}
}

@Article{collberg02,
  author  = {Collberg, Christian S.},
  title   = {Automatic derivation of compiler machine
             descriptions},
  journal = toplas,
  volume  = {24},
  number  = {4},
  pages   = {369--408},
  month   = jul,
  year    = {2002},
  annote  = {Journal version of \cite{collberg97}. Extracts a
             machine description (for an assembler-generating
             code generator) out of a working C compiler by
             compiling various test programs to assembly
             language, and analysing the resulting
             output. Presents various innovative techniques for
             achieving this result.}
}

@Article{hoogerbrugge+99,
  author    = {Hoogerbrugge, Jan and Augusteijn, Lex and Trum, Jeroen
               and van de Wiel, Rik},
  title     = {A code compression system based on pipelined
               interpreters},
  journal   = spe,
  year      = {1999},
  month     = sep,
  volume    = {29},
  number    = {11},
  pages     = {1005--1023},
  OPTannote = {}
}

@Book{jones+93,
  author    = {Jones, Neil D. and Gomard, Carsten K. and Sestoft, Peter},
  title     = {Partial Evaluation and Automatic Program Generation},
  publisher = {Prentice Hall},
  year      = {1993},
  url       = {http://www.dina.kvl.dk/~sestoft/pebook/},
  OPTannote = {}
}

@TechReport{rossi&sivalingam96,
  author      = {Rossi, Markku and Sivalingam, Kengatharan},
  title       = {A Survey of Instruction Dispatch Techniques for
                 Byte-Code Interpreters},
  institution = {Faculty of Information Technology, Helsinki
                 University of Technology},
  number      = {TKO-C79},
  month       = may,
  year        = {1996},
  url         = {http://www.cs.hut.fi/~cessu/papers/dispatch.ps},
  annote      = {Describes a number of interpreter dispatch
                 techniques and compares five dispatch techniques
                 empirically on five different machines (each with a
                 different architecture); unfortunately, the
                 benchmark is hardly described. The most remarkable
                 part about the paper is that it is the first to
                 propose the memcpy method for optimizing dispatch
                 (aka selective inlining \cite{piumarta&riccardi98}
                 or dynamic superinstructions); however, it does not
                 explain how to perform control flow in that method
                 (and their code example looks as if one cannot do
                 it); it mentions problems with non-relocatable code,
                 but does not provide a solution; it also mentions
                 dealing with immediate arguments using a data
                 pointer, but does not explain it in detail. The idea
                 is attributed to Kenneth Oksanen.}
}

@InProceedings{rakvic+02,
  author    = {Ryan Rakvic and Ed Grochowski and Bryan Black and
               Murali Annavaram and Trung Diep and John P. Shen},
  title     = {Performance Advantage of the Register Stack in Intel
               Itanium Processors},
  booktitle = {Workshop on Explicitly Parallel Instruction
               Computing (EPIC) Architectures and Compiler
               Techniques},
  OPTpages  = {},
  year      = {2002},
  url       = {http://systems.cs.colorado.edu/EPIC2/papers/s2-1-rakvic.pdf},
  annote    = {Evaluates the IA64 register stack feature by
               comparing versions with this feature and various
               numbers of registers with a (simulated) version of
               the architecture without this feature on 250M
               instruction traces of a subset of the SPEC2K integer
               benchmarks and on Oracle. The register stack engine
               significantly reduces the number of loads and
               stores; the save-restore traffic on the benchmarks
               falls (on average) from 15.25\% to 3.92\% for 96
               stack registers. The paper does not take the effect
               of shrink-wrapping \cite{chow88} into account, but
               on average, 76\% of the saved registers were used by
               the actual control flow, so shrink-wrapping could
               not work wonders anyway. The paper presents results
               on the cache impact of the presence or absence of
               the register stack (little overall impact). The
               overall impact on performance for the Itanium~2-like
               machine they simulated was 1.7\%--11.9\% (7\%
               average) speedup with 96 stack registers and
               slightly better with more. For a similar OoO machine
               model the speedup was 10.2\% for 96 stack registers
               and again a little more for more registers. To
               explain the better speedup of the OoO model, the
               paper presents data for the vitality (criticality)
               of loads, and in the OoO model 10\% of the restore
               loads have a dependence distance of 0 (compared to
               2\% on the in-order model).}
}

@Article{tuomi02,
  author  = {Tuomi, Ilkka},
  title   = {The Lives and Death of {Moore}'s Law},
  journal = {First Monday},
  volume  = {7},
  number  = {11},
  year    = {2002},
  url     = {http://firstmonday.org/issues/issue7_11/tuomi/index.html},
  annote  = {Takes a closer look at the various versions of
             Moore's law (as written by Moore and as reported by
             others), and provides empirical evidence of the
             untruth of all of these versions. The final section
             claims that ``references to Moore's Law
             qualitatively miss the character of development in
             semiconductor technology and information society'';
             this section did not convince me, though.}
}

@Book{warren03,
  author    = {Warren, Jr., Henry S.},
  title     = {Hacker's Delight},
  publisher = {Addison-Wesley},
  year      = {2003},
  annote    = {A collection of computer arithmetic and bit-fiddling
               stuff; the style is quite dry, so this is not very
               appropriate for fun reading, more as a reference
               work. In any case, every compiler writer and
               micro-optimizer should know it.}
}

@InProceedings{hartstein&puzak02,
  author   = {Hartstein, A. and Puzak, Thomas R.},
  title    = {The Optimum Pipeline Depth for a Microprocessor},
  pages    = {7--13},
  crossref = {isca02},
  annote   = {Presents a formula for the performance of a
              microprocessor when varying the pipeline length; the
              optimum pipeline length can be derived from
              this. Unfortunately there are two parameters in the
              formulae that depend on the microarchitecture and
              the workload, and these parameters cannot be
              determined analytically, only empirically. The paper
              also presents data from runs of a simulator with
              different pipeline lengths, and different (but
              hardly specified) workloads. The results match with
              curves from the formula (after matching for the
              missing parameters). One interesting result was that
              the SPEC workloads had a shorter optimum pipeline
              length than the other workloads used in the paper.}
}

@InProceedings{hrishikesh+02,
  author   = {Hrishikesh, M. S. and Jouppi, Norman P. and Farkas,
              Keith I. and Burger, Doug and Keckler, Stephen W. and
              Shivakumar, Premkishore},
  title    = {The Optimal Logic Depth per Pipeline Stage is 6 to 8
              FO4 Inverter Delays},
  pages    = {14--24},
  crossref = {isca02},
  annote   = {This paper takes a low-level simulator of the 21264,
              varies the number of pipeline stages, uses this to
              run a number of workloads (actually only traces from
              them), and reports performance results for
              them. With a latch overhead of about 2 FO4
              inverters, the optimal pipeline stage length is
              about 8 FO4 inverters (with work-load-dependent
              variations). Discusses various issues involved in
              quite some depth. In particular, this paper
              discusses how to pipeline the instruction window
              design (which has been identified as a bottleneck in
              earlier papers).}
}

@InProceedings{sprangle&carmean02,
  author   = {Eric Sprangle and Doug Carmean},
  title    = {Increasing Processor Performance by Implementing
              Deeper Pipelines},
  crossref = {isca02},
  pages    = {25--34},
  url      = {http://systems.cs.colorado.edu/ISCA2002/FinalPapers/Deep%20Pipes.pdf},
  annote   = {This paper starts with the Willamette (Pentium~4)
              pipeline and discusses and evaluates changes to the
              pipeline length. In particular, it gives numbers on
              how lengthening various latencies would affect IPC;
              on a per-cycle basis the ALU latency is most
              important, then L1 cache, then L2 cache, then branch
              misprediction; however, the total effect of
              lengthening the pipeline to double the clock rate
              gives the reverse order (because branch
              misprediction gains more cycles than the other
              latencies). The paper reports 52 pipeline stages
              with 1.96 times the original clock rate as optimal
              for the Pentium~4 microarchitecture, resulting in a
              reduction of 1.45 of core time and an overall
              speedup of about 1.29 (including waiting for
              memory). Various other topics are discussed, such as
              nonlinear effects when introducing bypasses, and
              varying cache sizes.  Recommended reading.}
}

@InProceedings{ernst&austin02,
  author   = {Ernst, Dan and Austin, Todd},
  title    = {Efficient Dynamic Scheduling Through Tag
              Elimination},
  pages    = {37--46},
  crossref = {isca02},
  annote   = {Propose and evaluate two methods to reduce the
              number of tag comparators necessary for a given
              instruction window size in an OoO CPU scheduler:
              Having special tag-reduced slots for instructions
              where one operand is already available (e.g.,
              because it is an immediate operand); and predicting
              which tag will be the last one to become
              available. Both techniques reduce IPC by a little,
              but reduce the critical path time through the
              scheduler by more, and also reduce the energy
              consumption.}
}

@InProceedings{fields+02,
  author   = {Brian Fields and Rastislav Bodik and Mark D. Hill},
  title    = {Slack: Maximizing Performance under Technological
              Constraints},
  crossref = {isca02},
  pages    = {47--58},
  annote   = {The idea here is to use the slack that is present on
              some dependence paths to use slower and cheaper (in,
              e.g. energy consumption) resources. This paper
              explores this topic quite well. It discusses several
              slack concepts (local, global, and apportioned) and
              how to measure them, and presents some results on
              some SPEC codes. There are quite a lot of
              instructions that have quite a bit of slack: on
              average, 75\% of the instructions can be apportioned
              a slack of 5 cycles or more. The paper discusses how
              to predict slack to make use of this fact, and
              evaluates how various 6-wide microarchitectures with
              such a slack predictor would fare (some of them
              quite well).}
}

@InProceedings{kim&smith02,
  author   = {Ho-Seop Kim and James E. Smith},
  title    = {An Instruction Set and Microarchitecture for
              Instruction Level Distributed Processing},
  crossref = {isca02},
  pages    = {71--81},
  url      = {http://www.ece.wisc.edu/~hskim/papers/kimh_ildp.pdf},
  annote   = {This paper addresses the problems of wide
              superscalars with communication across the chip and
              the number of write ports in the register file. The
              authors propose an architecture (ILDP) with
              general-purpose registers and with accumulators
              (with instructions only accessing one accumulator
              (read and/or write) and one register (read or
              write)); for the accumulators their death is
              specified explicitly in the instructions. The
              microarchitecture builds \emph{strands} from
              instructions working on an accumulator; a strand
              starts with an instruction writing to an accumulator
              without reading from it, continues with instructions
              reading from (and possibly writing to) the
              accumulator and ends with an instruction that kills
              the accumulator. Strands are allocated to one out of
              eight processing elements (PEs) dynamically (i.e.,
              accumulators are renamed). A PE consists of
              mainly one ALU data path (but also a copy of the
              GPRs and an L1 cache). They evaluated this
              architecture by translating Alpha binaries into it,
              and comparing their architecture to a 4-wide or
              8-wide Alpha implementation; their architecture has
              a lower L1 cache latency, though. The performance of
              ILDP in clock cycles is competitive, and one can
              expect faster clocks for ILDP. The paper also
              presents data for other stuff, e.g. general-purpose
              register writes, which have to be promoted between
              strands and which are relatively few.}
}

@InProceedings{lewis+02,
  author   = {Lewis, Jarrod A. and Black, Bryan and Lipasti,
              Mikko H.},
  title    = {Avoiding Initialization Misses on the Heap},
  pages    = {183--194},
  crossref = {isca02},
  annote   = {Gives measurements on the memory traffic arising
              from freshly allocated heap areas (23\% with a 2MB
              cache), and proposes a hardware scheme for avoiding
              it.}
}

@Proceedings{isca02,
  key       = {ISCA 29},
  title     = {$29^\textit{th}$ Annual International Symposium on Computer Architecture},
  booktitle = {$29^\textit{th}$ Annual International Symposium on Computer Architecture},
  year      = {2002}
}

@TechReport{johnson&ritchie81,
  author      = {Johnson, Steve C. and Ritchie, Dennis M.},
  title       = {The C Language Calling Sequence},
  institution = {Bell Laboratories},
  type        = {Computing Science Technical Report},
  number      = {102},
  year        = {1981},
  html-url    = {http://cm.bell-labs.com/cm/cs/who/dmr/clcs.html},
  ps-url      = {http://cm.bell-labs.com/cm/cs/who/dmr/clcs.ps},
  annote      = {Detailed discussion of calling conventions in
                 general and C calling conventions in particular}
}

@InProceedings{ogata+02,
  author   = {Kazunori Ogata and Hideaki Komatsu and Toshio
              Nakatani},
  title    = {Bytecode Fetch Optimization for a {Java}
              Interpreter},
  crossref = {asplos02},
  pages    = {58--67},
  annote   = {The paper presents a Java Bytecode interpreter for
              the PowerPC architecture and some optimizations and
              evaluates them: stack caching (a few variations),
              position-based handler customization and
              position-based speculative decoding (software
              pipelining of the interpreter). Position-based
              handler customization deals with different
              alignments of bytecodes by having four states in the
              interpreter for the different alignments, each state
              with its own specialized copy of the
              interpreter. For stack caching they evaluated a
              fixed one-TOS-register organization with
              write-through caching (5.6\% speedup over base), and
              dynamic stack caching with two registers (3 states,
              7/9\% speedup over base), and used the write-through
              organization for further experiments; write-through
              is not compared empirically to
              write-back. Position-based handler customization
              buys another 19\%, and software pipelining an
              additional 3.4\%. The paper also presents results on
              memory traffic (both I and D).}
}


@Comment{
  Unfilled template for a further paper from the asplos02 proceedings.
  It is kept as a comment so that the empty author/title/pages fields do
  not produce warnings or a blank bibliography item; to use it, fill in
  the fields, choose a real key, and turn it back into an entry.

  InProceedings{02,
    author   = {},
    title    = {},
    crossref = {asplos02},
    pages    = {},
    annote   = {}
  }
}

@Proceedings{asplos02,
  key       = {ASPLOS-X},
  title     = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-X)},
  booktitle = {Architectural Support for Programming Languages and
               Operating Systems (ASPLOS-X)},
  year      = {2002}
}

@InProceedings{stoddart02,
  author   = {Bill Stoddart},
  title    = {Efficient ``Reversibility'' with Guards and Choice},
  crossref = {euroforth02},
  pages    = {3--15},
  url      = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/bill.rev.ps.gz},
  abstract = {We describe reversibility mechanisms incorporated
              into a native code Forth used as an intermediate
              language for a B-GSL compiler. In contrast to our
              previous work, information preservation is limited
              to what is needed to implement the B-GSL semantics
              for non-deterministic choice and guard. Design
              choices are discussed with reference to the Pentium
              architecture. The use of guards and choice in Forth
              is illustrated with the Knight's Tour.}
}

@InProceedings{stoddart&zeyda02,
  author   = {Stoddart, Bill and Zeyda, Frank},
  title    = {Implementing Sets for Reversible Computation},
  pages    = {16--23},
  crossref = {euroforth02},
  url      = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/bill.sets.ps.gz},
  abstract = {Sets provide a very general tool for representing
              information and modelling the behaviour of
              systems. We consider their implementation and
              associated problems of garbage collection in the
              context of reversible computation. We describe our
              implementation technique, which uses ordered arrays,
              and discuss scalability of performance.}
}

@InProceedings{gregg&waldron02,
  author   = {Gregg, David and Waldron, John},
  title    = {Primitive Sequences in General Purpose {Forth}
              Programs},
  pages    = {24--32},
  crossref = {euroforth02},
  note     = {Refereed},
  url      = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/gregg.ps.gz},
  abstract = {Instruction dispatch is responsible for most of the
              running time of Forth interpreters, especially on
              modern pipelined processors. Superinstructions are
              an important optimisation to reduce the number of
              instruction dispatches. Superinstructions have been
              used for many years to optimise interpreters, but an
              open problem is the choice of superinstructions to
              include in the interpreter. In this paper we propose
              a number of heuristics for choosing
              superinstructions, and evaluate them for general
              purpose Forth programs. We find that static measures
              of frequency perform well for superinstruction
              selection. As few as eight superinstructions can
              reduce the number of instruction dispatches by an
              average of 15\%, and reductions of up to 45\% are
              possible with large numbers of superinstructions.}
}

@InProceedings{ertl02ef,
  author   = {Ertl, M. Anton},
  title    = {The Evolution of Vmgen},
  pages    = {33--37},
  crossref = {euroforth02},
  note     = {Slides},
  url      = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ertl.ps.gz}
}

@inproceedings{poial02,
  author = {P{\"o}ial, Jaanus},
  title = {Stack Effect Calculus with Typed Wildcards,
    Polymorphism and Inheritance},
  pages = {38},
  slides-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/poial.ps.gz},
  abstract-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/poial.txt},
  note = {Abstract in hardcopy proceedings},
  abstract = {In early 1990s author introduced a formal stack
    effect calculus for verification of compilers that
    translated high level languages (Fortran, Modula)
    into Forth, see [PST90],[P90a],[P90h]. The calculus
    was partially applicable to static type checking of
    Forth programs, but this was not the primary goal
    these days. Stack effects (formal specifications of
    input and output parameters for stack operations)
    were defined using flat type space where different
    types were considered incompatible and no subtyping
    or inheritance was allowed. The so called wildcard
    types were introduced by sets of stack effects, see
    [P91]. This framework does not suite well with
    abstract stack machines that use principles of
    object orientation (see, for example, [AG98] about
    type checking in Java Virtual Machine). Peter Knaggs
    and Bill Stoddart improved the type signature
    algebra and introduced a lot of useful things (type
    variables, subtyping, reference types, wildcards,
    etc.), see [SK93], [K93].\par In this presentation a
    modified framework for type checking is proposed to
    support typed wildcards and inheritance. Now it is
    possible to perform little more exact type
    calculations and express polymorphic
    operations. Every type symbol has its place in the
    type hierarchy and, at the same time, it may be
    treated as a wildcard symbol. Earlier approaches
    matched wildcards to concrete symbols (resulting in
    this concrete symbol) or to other wildcards
    (resulting in a new wildcard); this approach is more
    general allowing stepwise refinement of types. Not
    only the type checking is target here, but also the
    (static) choice of  the right version for
    polymorphic operations (known as method overloading
    in object oriented languages). Given a type
    hierarchy, formal specifications for operations and
    a program we can refine the type signatures in the
    program  according to the context where an operation
    appears. Experimental implementation of this
    framework is in progress.},
  crossref = {euroforth02},
}

@inproceedings{ceballos02udp,
  author = {de Ceballos, Federico},
  title = {{UDP/IP} over {Ethernet} for 8-Bit Microcontrollers},
  url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-udp.ps.gz},
  pdf-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-udp.pdf},
  note = {Late paper, not in hard copy},
  crossref = {euroforth02},
}

@inproceedings{ceballos02qnx,
  author = {de Ceballos, Federico},
  title = {Forth for the {QNX} Realtime Platform},
  url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-qnx.ps.gz},
  pdf-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-qnx.pdf},
  note = {Late paper, not in hard copy},
  crossref = {euroforth02},
}

@inproceedings{ertl02efb,
  author = {Ertl, M. Anton},
  title = {Superinstructions in {Gforth}},
  note = {Demonstration only, no paper},
  crossref = {euroforth02},
}

@proceedings{euroforth02,
  key = {EuroForth'02},
  editor = {Ertl, M. Anton},
  title = {18th EuroForth Conference},
  booktitle = {18th EuroForth Conference},
  year = {2002},
}

@inproceedings{pelc98,
  author = {Pelc, Stephen},
  title = {The {MPE} {VFX} {Forth} Code Generator},
  booktitle = {EuroForth '98},
  year = {1998},
  url = {http://dec.bournemouth.ac.uk/forth/euro/ef98/pelc98.pdf},
  annote = {Contains very little technical information, but
    gives a nice example of the resulting code quality.},
}

@inproceedings{yaghmour&dagenais00,
  author = {Yaghmour, Karim and Dagenais, Michel R.},
  title = {Measuring and Characterizing System Behaviour Using
    Kernel-Event Logging},
  pages = {13--26},
  annote = {Describes the Linux Trace Toolkit (LTT).},
  crossref = {usenix00},
}

@inproceedings{roselli+00,
  author = {Roselli, Drew and Lorch, Jacob R. and Anderson, Thomas
    E.},
  title = {A Comparison of File System Workloads},
  pages = {41--54},
  annote = {They collected file system traces from four
    different environments (3 Unix, 1 WNT), and evaluate
    them. Some of the more interesting results are:
    relatively small caches (16MB or so) capture most of
    the cacheable read traffic; most files are either
    read-mostly or write-mostly; a write delay of 30s is
    not very effective at reducing write bandwidth (even
    ignoring syncs); large files are often accessed
    randomly.},
  crossref = {usenix00},
}

@inproceedings{zadok&nieh00,
  author = {Zadok, Erez and Nieh, Jason},
  title = {{FiST}: A Language for Stackable File Systems},
  pages = {55--70},
  annote = {Stackable file systems allow features to be added to
    existing file systems (e.g., encryption, unions,
    access control). This paper presents a language,
    library, and system for writing stackable file
    systems portably (the system currently supports
    Solaris, FreeBSD, and Linux).},
  crossref = {usenix00},
}

@inproceedings{seltzer+00,
  author = {Seltzer, Margo I. and Ganger, Gregory R. and McKusick, M. Kirk
    and Smith, Keith A. and Soules, Craig A. N.
    and Stein, Christopher A.},
  title = {Journaling vs. Soft Updates: Asynchronous Meta-Data
    Protection in File Systems},
  pages = {71--84},
  annote = {Compares the features of two different journaling
    file systems built upon FFS and FFS with soft
    updates qualitatively, and compares their
    performance quantitatively (as well as with FFS with
    synchronous and with asynchronous metadata
    updates). This paper also explains how Soft Updates
    deals with the problem of having two changes to the
    same block, which can lead to having cycles in the
    dependences of the blocks.},
  crossref = {usenix00},
}

@inproceedings{wong&seltzer00,
  author = {Wong, Alexander Ya-Li and Seltzer, Margo},
  title = {Operating System Support for Multi-User, Remote,
    Graphical Interaction},
  pages = {183--196},
  annote = {Presents measurements on the performance (especially
    latency, but also bandwidth requirements) of X (on
    Linux) and WNT TSE when displaying on remote
    displays. Data are presented on latencies resulting
    from CPU load and scheduling schemes (interestingly,
    Linux~2.0.36 with its simple scheduler performed
    much better than WNT, which has a complex scheduler
    for favouring interactive tasks), on memory usage
    and the effect on latency (no surprises here), and
    on network performance (here, TSE's RDP protocol
    proved to be more bandwidth efficient than X and its
    compressed version LBX; a sufficiently large bitmap
    cache in the display would help a lot (especially in the
    context of animated GIFs)).},
  crossref = {usenix00},
}

@inproceedings{engelschall00,
  author = {Engelschall, Ralf S.},
  title = {Portable Multithreading: The Signal Stack Trick for
    User-Space Thread Creation},
  pages = {239--249},
  annote = {Explains in detail how to implement a
    POSIX-compatible thread library with commonly
    available ANSI C, SUSv2, and POSIX functions. The
    main problem is the thread initialization. The
    approach described in the paper is used in the GNU
    Portable Threads (Pth) library.},
  crossref = {usenix00},
}

@inproceedings{brown&patterson00,
  author = {Brown, Aaron and Patterson, David A.},
  title = {Towards Availability Benchmarks: A Case Study of
    Software {RAID} Systems},
  pages = {263--276},
  annote = {First discusses the methodology of availability
    benchmarks (very interesting) and then gives an
    example of how to apply such a methodology to RAID
    systems on different OSs (Solaris, Linux, WNT).},
  crossref = {usenix00},
}

@inproceedings{smaragdakis&wilson00,
  author = {Smaragdakis, Yannis and Wilson, Paul},
  title = {Performing Replacement in Modem Pools},
  pages = {277--291},
  annote = {The problem discussed here is which
    supposedly-inactive modem connection to drop if
    another user calls in and all modems are in use. The
    paper presents the CIRG (conditional inter-reference
    gap) algorithm, which performs a little better than
    LRU.},
  crossref = {usenix00},
}

@proceedings{usenix00,
  key = {Usenix '00},
  title = {Usenix Annual Technical Conference},
  booktitle = {Usenix Annual Technical Conference},
  year = {2000},
}

@inproceedings{adya+02,
  author = {Adya, Atul and Bolosky, William J. and Castro, Miguel
    and Cermak, Gerald and Chaiken, Ronnie and Douceur, John
    R. and Howell, Jon and Lorch, Jacob R. and
    Theimer, Marvin and Wattenhofer, Roger P.},
  title = {{FARSITE}: Federated, Available, and Reliable
    Storage for an Incompletely Trusted Environment},
  pages = {1--14},
  annote = {A distributed file system for WANs consisting of
    untrusted systems with a design goal of 100,000
    machines. The machines are assumed to be typical
    PC/desktop machines, and the users are assumed to
    behave like desktop users (in particular, concurrent
    distributed write access is rare, and usually not
    very massive). The system is based on groups of
    byzantine-fault-tolerant machines (up to 1/3rd of
    the machines can fail without the system failing)
    for serving subtrees of the global directory tree;
    these \emph{directory groups} are a central concept
    in FARSITE. Encrypted files with local caching and
    log-based update mechanisms are used for individual
    files, with leases for various access types granted
    by the directory group. The paper discusses various
    aspects of the design in depth and provides some
    performance numbers.  Recommended reading.},
  crossref = {osdi02},
}

@inproceedings{saito+02,
  author = {Saito, Yasushi and Karamanolis, Christos and Karlsson,
    Magnus and Mahalingam, Mallik},
  title = {Taming Aggressive Replication in the {Pangaea}
    Wide-Area File System},
  pages = {15--30},
  annote = {Pangaea is a distributed file system for a WAN of
    trusted (as of this paper) PC-type computers. File
    replicas are used for providing good access
    performance and availability. The paper discusses in
    depth how replicas are managed, in particular how
    updates and replicas are distributed efficiently
    without requiring special administration. Pangaea
    uses optimistic update, and thus has the potential
    for update conflicts. The paper contains a detailed
    empirical evaluation.},
  crossref = {osdi02},
}

@inproceedings{muthitacharoen+02,
  author = {Muthitacharoen, Athicha and Morris, Robert and Gil, Thomer
    M. and Chen, Benjie},
  title = {Ivy: A Read/Write Peer-to-Peer File System},
  pages = {31--44},
  annote = {A distributed file system based on the DHash
    peer-to-peer block storage system, without full
    trust. The file system is a kind of distributed
    version of a log-structured file system (cleaning is
    hardly discussed in the paper), with a file system
    consisting of several logs, one log per update
    source; users can decide to use (trust) different
    logs, resulting in different views of the file
    system, but cooperating users should use the same
    view. Snapshots are used to avoid having to traverse
    the log for older data. The paper discusses various
    aspects and provides performance data.},
  crossref = {osdi02},
}

@inproceedings{kumar&li02,
  author = {Kumar, Sanjeev and Li, Kai},
  title = {Using Model Checking to Debug Device Firmware},
  pages = {61--74},
  annote = {Describes the domain-specific language ESP for
    writing device-drivers, how it supports model
    checking, and experiences with model-checking device
    drivers.},
  crossref = {osdi02},
}

@inproceedings{musuvathi+02,
  author = {Musuvathi, Madanlal and Park, David Y. W. and Chou,
    Andy and Engler, Dawson R. and Dill, David L.},
  title = {{CMC}: A Pragmatic Approach to Model Checking Real
    Code},
  pages = {75--88},
  annote = {Describes a model checker for C and experiences in
    using it. One point that the paper makes is that
    while the model checker may not terminate in
    practical time (and thus prove correctness or at
    least produce an exhaustive list of bugs), it does
    produce some bug reports. Also, the paper observes
    that correct programs take much less time to
    model-check, so after removing the bugs that the
    model-checker finds, it might terminate.},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-1,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-2,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-3,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-4,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-5,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@inproceedings{osdi02-todo-6,
  author = {},
  title = {},
  pages = {},
  annote = {},
  crossref = {osdi02},
}

@proceedings{osdi02,
  key = {OSDI '02},
  title = {Operating Systems Design and Implementation (OSDI '02)},
  booktitle = {Operating Systems Design and Implementation (OSDI '02)},
  year = {2002},
}

@techreport{moudgill&moreno96,
  author = {Moudgill, Mayan and Moreno, Jaime},
  title = {Run-Time Detection and Recovery From Incorrectly
    Reordered Memory Operations},
  institution = {IBM},
  number = {RC20857},
  year = {1996},
  abstract-url = {http://domino.watson.ibm.com/library/CyberDig.nsf/0/12a089effaf3a918852565930072a0db?OpenDocument},
  url = {http://domino.watson.ibm.com/library/CyberDig.nsf/papers/12A089EFFAF3A918852565930072A0DB/%24File/8692.ps.gz},
  annote = {Propose a new method for the run-time disambiguation
    of aliases: move the load up across the store(s),
    and after the last store, load from the address
    again, and compare the loaded value with the value
    produced by the moved-up load; if they are equal, no
    destructive aliasing occurred, and there is no need
    to execute compensation code. This method does not
    require overhead proportional to loads*stores,
    unlike \cite{nicolau89}, and also does not require
    special hardware \cite{gallagher+94}. The paper also
    discusses how to integrate this transformation in a
    compiler and gives some results.},
}

@inproceedings{ramsey03,
  author = {Ramsey, Norman},
  title = {Embedding an Interpreted Language Using Higher-Order
    Functions and Types},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {6--14},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/ramsey.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858571&type=pdf},
  abstract = {Using an embedded, interpreted language to control a
    complicated application can have significant
    software-engineering benefits. But existing
    interpreters are designed for embedding into C
    code. To embed an interpreter into a different
    language requires a suitable API. Lua-ML is a new
    API that uses higher-order functions and types to
    simplify the use of an embedded interpreter. A
    typical application-program function can be added to
    a Lua-ML interpreter simply by describing the
    function's type.},
}

@inproceedings{liu&moore03,
  author = {Liu, Hanbing and Moore, J. Strother},
  title = {Executable {JVM} Model for Analytical Reasoning: A
    Study},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {15--23},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/liu.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858572&type=pdf},
  abstract = {To study the properties of the Java Virtual
    Machine(JVM) and Java programs, our research group
    has produced a series of JVM models written in a
    functional subset of Common Lisp. In this paper, we
    present our most complete JVM model from this
    series, namely, M6, which is derived from a careful
    study of the J2ME KVM[16] implementation.},
}

@inproceedings{franz+03,
  author = {Franz, Michael and Chandra, Deepak and Gal, Andreas and
    Haldar, Vivek and Reig, Fermin and Wang, Ning},
  title = {A Portable Virtual Machine Target for Proof-Carrying
    Code},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {24--31},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/franz.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858573&type=pdf},
  abstract = {Virtual Machines (VMs) and Proof-Carrying Code (PCC)
    are two techniques that have been used independently
    to provide safety for (mobile) code. Existing
    virtual machines, such as the Java VM, have several
    drawbacks: First, the effort required for safety
    verification is considerable. Second and more
    subtly, the need to provide such verification by the
    code consumer inhibits the amount of optimization
    that can be performed by the code producer. This in
    turn makes just-in-time compilation surprisingly
    expensive. Proof-Carrying Code, on the other hand,
    has its own set of limitations, among which are the
    sizes of the proofs and the fact that the certified
    code is no longer machine-independent. In this
    paper, we describe work in progress on combining
    these approaches. Our hybrid safe-code solution uses
    a virtual machine that has been designed
    specifically to support proof-carrying code, while
    simultaneously providing efficient just-in-time
    compilation and target-machine independence. In
    particular, our approach reduces the complexity of
    the required proofs, resulting in fewer proof
    obligations that need to be discharged at the target
    machine. },
}

@inproceedings{lattendresse&feeley03,
  author = {Latendresse, Mario and Feeley, Marc},
  title = {Generation of Fast Interpreters for {Huffman}
    Compressed Bytecode},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {32--40},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/latendresse.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858574&type=pdf},
  abstract = {Embedded systems often have severe memory
    constraints requiring careful encoding of
    programs. For example, smart cards have on the order
    of 1K of RAM, 16K of non-volatile memory, and 24K of
    ROM. A virtual machine can be an effective approach
    to obtain compact programs but instructions are
    commonly encoded using one byte for the opcode and
    multiple bytes for the operands, which can be
    wasteful and thus limit the size of programs
    runnable on embedded systems. Our approach uses
    canonical Huffman codes to generate compact opcodes
    with custom-sized operand fields and with a virtual
    machine that directly executes this compact code. We
    present techniques to automatically generate the new
    instruction formats and the decoder. In effect, this
    automatically creates both an instruction set for a
    customized virtual machine and an implementation of
    that machine. We demonstrate that, without prior
    decompression, fast decoding of these virtual
    compressed instructions is feasible. Through
    experiments on Scheme and Java, we demonstrate the
    speed of these decoders. Java benchmarks show an
    average execution slowdown of 9\%. Compression
    factors highly depend on the original bytecode and
    the training sample, but typically vary from 30\% to
    60\%. },
}

@inproceedings{davis+03,
  author = {Davis, Brian and Beatty, Andrew and Casey, Kevin and
    Gregg, David and Waldron, John},
  title = {The Case for Virtual Register Machines},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {41--49},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/davis.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858575&type=pdf},
  abstract = {Virtual machines (VMs) are a popular target for
    language implementers. Conventional wisdom tells us
    that virtual stack architectures can be implemented
    with an interpreter more efficiently, since the
    location of operands is implicit in the stack
    pointer. In contrast, the operands of register
    machine instructions must be specified
    explicitly. In this paper, we present a working
    system for translating stack-based Java virtual
    machine (JVM) code to a simple register code. We
    describe the translation process, the complicated
    parts of the JVM which make translation more
    difficult, and the optimisations needed to eliminate
    copy instructions. Experimental results show that a
    register format reduces the number of executed
    instructions by 34.88\%, while increasing the number
    of bytecode loads by an average of 44.81\%. Overall,
    this corresponds to an increase of 2.32 loads for
    each dispatch removed. We believe that the high cost
    of dispatches makes register machines attractive
    even at the cost of increased loads.},
}

@inproceedings{sullivan+03,
  author = {Sullivan, Gregory T. and Bruening, Derek L. and Baron,
    Iris and Garnett, Timothy and Amarasinghe, Saman},
  title = {Dynamic Native Optimization of Interpreters},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {50--57},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/sullivan.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858576&type=pdf},
  abstract = {For domain specific languages, ``scripting
    languages'', dynamic languages, and for virtual
    machine-based languages, the most straight-forward
    implementation strategy is to write an
    interpreter. A simple interpreter consists of a loop
    that fetches the next bytecode, dispatches to the
    routine handling that bytecode, then loops. There
    are many ways to improve upon this simple mechanism,
    but as long as the execution of the program is
    driven by a representation of the program other than
    as a stream of native instructions, there will be
    some ``interpretive overhead''.\par There is a long
    history of approaches to removing interpretive
    overhead from programming language
    implementations. In practice, what often happens is
    that, once an interpreted language becomes popular,
    pressure builds to improve performance until
    eventually a project is undertaken to implement a
    native Just In Time (JIT) compiler for the
    language. Implementing a JIT is usually a large
    effort, affects a significant part of the existing
    language implementation, and adds a significant
    amount of code and complexity to the overall code
    base.\par In this paper, we present an innovative
    approach that dynamically removes much of the
    interpreted overhead from language implementations,
    with minimal instrumentation of the original
    interpreter. While it does not give the performance
    improvements of hand-crafted native compilers, our
    system provides an appealing point on the language
    implementation spectrum.},
}

@inproceedings{whaley03,
  author = {Whaley, John},
  title = {Joeq: A Virtual Machine and Compiler Infrastructure},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {58--66},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/whaley.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858577&type=pdf},
  abstract = {Joeq is a virtual machine and compiler
    infrastructure designed to facilitate research in
    virtual machine technologies such as Just-In-Time
    and Ahead-Of-Time compilation, advanced garbage
    collection techniques, distributed computation,
    sophisticated scheduling algorithms, and advanced
    run time techniques. Joeq is entirely implemented in
    Java, leading to reliability, portability,
    maintainability, and efficiency. It is also
    language-independent, so code from any supported
    language can be seamlessly compiled, linked, and
    executed - all dynamically. Each component of the
    virtual machine is written to be independent with a
    general but well-defined interface, making it easy
    to experiment with new ideas. Joeq is released as
    open source software, and is being used as a
    framework by researchers all over the world on
    topics ranging from automatic distributed virtual
    machines to whole-program pointer analysis.},
}

@inproceedings{palacz+03,
  author = {Palacz, K. and Baker, J. and Flack, C. and Grothoff, C.
    and Yamauchi, H. and Vitek, J.},
  title = {Engineering a Customizable Intermediate
    Representation},
  booktitle = {Interpreters, Virtual Machines and Emulators
    (IVME~'03)},
  year = {2003},
  pages = {67--76},
  url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/palacz.ps.gz},
  url2 = {http://portal.acm.org/ft_gateway.cfm?id=858578&type=pdf},
  abstract = {The Ovm framework is a set of tools and components
    for building language runtimes. We present the
    intermediate representation and software design
    patterns used throughout the framework. One of the
    main themes in this work has been to support
    experimentation with new linguistic constructs and
    implementation techniques. To this end, framework
    components were designed to be parametric with
    respect to the instruction set on which they
    operate. We argue that our approach eases the task
    of writing new components without sacrificing
    efficiency.},
}

@techreport{bak&griesemer03,
  author = {Bak, Lars and Griesemer, Robert},
  title = {Interpreting Functions Utilizing a Hybrid of Virtual
    and Native Machine Instructions},
  type = {Patent},
  number = {6513156 B2},
  institution = {US},
  year = {2003},
  annote = {This patent describes how to replace some
    interpreted sequences in JVM code (apparently
    restricted to straight-line code) with native code
    (somewhat like \cite{yannikos94}). The native code
    is apparently generated by macro-expansion of the
    JVM instructions, like it is done in simple Forth
    native-code compilers \cite{rose86,paysan91}. There
    is no explanation how the native code for each JVM
    instruction is generated. The patent discusses the
    management of native-code snippets a lot and
    presents a complex solution, but does not give a
    rationale for that. No evaluation of the proposed
    approach is given, and the presentation is pretty
    bad.},
}

@techreport{griesemer01,
  author      = {Robert Griesemer},
  title       = {Interpreter Generation and Implementation Utilizing
    Interpreter States and Register Caching},
  type        = {Patent},
  number      = {6192516 B1},
  institution = {US},
  year        = {2001},
  annote      = {This patent describes a JVM interpreter with dynamic
    stack caching, and how it is generated. The stack
    cache keeps 0 or 1 stack items in registers; the
    interesting variation here is that the system has
    four states for one stack item in registers, one
    state for each type (int, long, float, double). The
    interpreter is generated by producing, for each
    state/instruction combination, a prefix that sets up
    the stack state, a template that does the main work,
    and a suffix that dispatches the next instruction
    (somewhat like vmgen \cite{ertl93,ertl+02}). The
    native code for these parts is generated in a
    machine-specific way (through an
    assembler-in-C++). The patent also mentions how to
    share code between different implementations of an
    instruction for different states. The patent
    presents no evaluation of the approach.}
}

@inproceedings{griesemer99,
  author    = {Robert Griesemer},
  title     = {Generation of Virtual Machine Code at Startup},
  year      = {1999},
  booktitle = {OOPSLA '99 Workshop on Simplicity, Performance, and
    Portability in Virtual Machine Design},
  annote    = {This paper argues that using ordinary assemblers for
    writing part of the program has several
    disadvantages, and proposes generating machine code
    through an assembler-in-C++ at run-time. The example
    given is the HotSpot JVM interpreter, which is
    generated at startup of the JVM in this way. An
    additional benefit in this context was that the
    infrastructure for the machine-code generation is
    also needed for the JIT part of HotSpot.}
}

@inproceedings{costa99,
  author    = {Santos Costa, V{\'\i}tor},
  title     = {Optimising Bytecode Emulation for {Prolog}},
  booktitle = {Proceedings of PPDP'99},
  series    = {LNCS},
  volume    = {1702},
  publisher = {Springer-Verlag},
  pages     = {261--267},
  month     = sep,
  year      = {1999}
}

@article{grant+00,
  author       = {Brian Grant and Markus Mock and Matthai Philipose
    and Craig Chambers and Susan J. Eggers},
  title        = {{DyC}: An Expressive Annotation-Directed Dynamic
    Compiler for {C}},
  journal      = {Theoretical Computer Science},
  volume       = {248},
  number       = {1--2},
  pages        = {147--199},
  year         = {2000},
  url          = {http://www.cs.washington.edu/research/projects/unisw/DynComp/www/Papers/tr-97-03-03.ps.gz},
  abstract-url = {http://www.cs.washington.edu/research/projects/unisw/DynComp/www/Papers/tr-97-03-03-abstract.html},
  OPTannote    = {}
}

@inproceedings{peng+04,
  author    = {Jinzhan Peng and Gansha Wu and Guei-Yuan Lueh},
  title     = {Code Sharing among States for Stack-Caching Interpreter},
  pages     = {15--22},
  year      = {2004},
  crossref  = {ivme04},
  OPTannote = {}
}

@inproceedings{vitale&abdelrahman04,
  author    = {Benjamin Vitale and Tarek S. Abdelrahman},
  title     = {Catenation and Specialization for {Tcl} Virtual
    Machine Performance},
  pages     = {42--50},
  year      = {2004},
  crossref  = {ivme04},
  OPTannote = {}
}

% Parent entry inherited through crossref = {ivme04}. Classic BibTeX only
% resolves a crossref when the parent appears after the entries that refer
% to it, so keep this entry below them.
@proceedings{ivme04,
  title     = {IVME '04 Proceedings},
  booktitle = {IVME '04 Proceedings},
  year      = {2004},
  OPTeditor = {}
}

@mastersthesis{wu96,
  author = {Qunyan Wu},
  title  = {Register Allocation via Hierarchical Graph Coloring},
  school = {Michigan Technological University},
  year   = {1996},
  url    = {ftp://cs.mtu.edu/pub/carr/qwu.thesis.ps.gz},
  annote = {The author compares Hierarchical Graph Coloring
    \cite{callahan&koblenz91} with Briggs' Graph
    Coloring \cite{briggs+89} (without Briggs' version
    of live range splitting). The results indicate that
    hierarchical graph coloring does worse on most
    benchmarks. The paper looks at several variations of
    hierarchical graph coloring to identify the
    performance impact of the different elements of
    hierarchical graph coloring: Using different numbers
    of reserved registers (the more are reserved, the
    worse the results are for most benchmarks);
    eliminating tiling in several stages (splitting the
    program into fewer tiles usually has a positive
    effect on the results); the effect of using register
    preferences (mostly little effect, but occasionally
    large effects in both directions); and what happens
    if spill cost is ignored (then hierarchical graph
    coloring loses because it introduces additional
    jumps (to get single-entry-single-exit tiles) and
    moves). In general the evaluation is quite nice and
    very thorough, but I wonder if adding some
    postprocessing to eliminate the additional jumps
    would not have made hierarchical graph coloring look
    better throughout all variations (but probably not
    good enough to make it worthwhile).}
}

@phdthesis{winkel04,
  author    = {Sebastian Winkel},
  title     = {Optimal Instruction Scheduling for the Itanium
    Processor Architecture},
  school    = {Universit{\"a}t des Saarlandes},
  year      = {2004},
  url       = {http://www.dagstuhl.de/files/Proceedings/05/05101/05101.WinkelSebastian.Paper.pdf},
  OPTannote = {}
}

@article{fu+05,
  author    = {Changqing Fu and Kent Wilken and David Goodwin},
  title     = {A Faster Optimal Register Allocator},
  journal   = {Journal of Instruction-Level Parallelism},
  year      = {2005},
  volume    = {7},
  OPTnumber = {},
  OPTpages  = {},
  month     = jan,
  note      = {http://www.jilp.org/vol7/v7paper1.pdf},
  url       = {http://www.jilp.org/vol7/v7paper1.pdf},
  annote    = {This paper reports progress over earlier work
    \cite{goodwin&wilken96} on optimal register
    allocation with integer (linear) programming: Many
    of the load points, store points, or deallocation
    points used in the original model are unnecessary
    for optimality. A model with fewer such points can
    be solved faster. The paper presents ways to
    eliminate most of the redundant points, and
    evaluates them on the SPEC CPU92 and SPEC CPU2000
    benchmarks. It also reevaluates the old model on
    current hardware with a current ILP solver, and
    compares the three sets of results, showing that all
    three components (hardware, the better ILP solver,
    and the improved model) contribute 1--2 orders of
    magnitude of speedup. The new optimal register
    allocator can optimally allocate 98\% of the
    functions in the SPEC CPU2000 benchmarks and find a
    near-optimal result for another 1\%, resulting in a
    dynamic instruction reduction on HP-PA of
    5.3\%--19.2\% over a graph colouring register
    allocator. The boundary between solved and unsolved
    is for function sizes in the range 90--2000
    instructions.}
}

@inproceedings{makarov04,
  author    = {Vladimir N. Makarov},
  title     = {Fighting Register Pressure in {GCC}},
  booktitle = {{GCC} Developers' Summit 2004},
  pages     = {85--103},
  year      = {2004},
  url       = {http://gcc.fyxm.net/summit/2004/Fighting%20Register%20Pressure.pdf},
  annote    = {This paper discusses a number of improvements for
    the original register allocator of gcc (which is
    still competitive), and presents results for each of
    them on the SPEC CPU2000 or CPU95 benchmarks on
    Pentium~4 or Athlon~MP machines (i.e., they were all
    implemented and worked). There are so many results
    that I don't want to summarize all of them here
    (read the paper); one of the results is that even
    register-pressure sensitive prepass instruction
    scheduling led to significant slowdowns.}
}

@inproceedings{kim&lipasti04,
  author    = {Ilhyun Kim and Mikko H. Lipasti},
  title     = {Understanding Scheduling Replay Schemes},
  booktitle = {10th International Symposium on High Performance
    Computer Architecture (HPCA'04)},
  pages     = {198--209},
  year      = {2004},
  url       = {http://www.ece.wisc.edu/~ikim/hpca2004ikim.pdf},
  annote    = {Hardware schedulers have to issue instructions
    before they know the exact latencies of the
    instructions they depend on (e.g., latencies can
    vary because of cache misses). Replay schemes deal
    with this problem by scheduling instructions early,
    and canceling and replaying them if their operands
    are not available. However, the scheduler may
    already have issued instructions depending on that
    instruction, and they all have to be canceled and
    replayed. This paper explores various replay
    schemes, discusses their strengths and weaknesses,
    and presents empirical results.}
}

@misc{mashey2005,
  author       = {John Mashey},
  title        = {{SPEC} use of Geometric Mean},
  howpublished = {Usenet Article
    <1115972116.172947.194880@g44g2000cwa.googlegroups.com>},
  year         = {2005},
  month        = may,
  url          = {http://groups.google.at/group/comp.arch/msg/416e58b5e48c1715},
  annote       = {Discusses why the geometric mean is appropriate in
    many cases for aggregating benchmark results (like
    SPEC) into one number: The statistical distribution
    of the values is usually log-normal (and the reasons
    for this are discussed), and the right way to
    aggregate such values is the geometric mean (which
    is the arithmetic mean in the log scale).}
}

@article{bartley92,
  author    = {David H. Bartley},
  title     = {Optimizing Stack Frame Accesses for Processors with
    Restricted Addressing Modes},
  journal   = spe,
  volume    = {22},
  number    = {2},
  pages     = {101--110},
  year      = {1992},
  OPTannote = {}
}

@article{liao+96,
  author    = {Stan Liao and Srinivas Devadas and Kurt Keutzer and
    Steve Tjiang and Albert Wang},
  title     = {Storage Assignment to Decrease Code Size},
  journal   = toplas,
  year      = {1996},
  volume    = {18},
  number    = {3},
  pages     = {235--253},
  OPTannote = {}
}

@manual{bundy+85,
  title  = {The Researcher's Bible},
  author = {Alan Bundy and Ben du Boulay and Jim Howe and Gordon
    Plotkin},
  year   = {1985},
  url    = {http://homepages.inf.ed.ac.uk/bundy/how-tos/resbible.html},
  annote = {Good advice on the problems on the way to a
    Ph.D. and how to overcome them. An earlier version
    is \cite{bundy+84}; this version is being maintained.}
}

@article{aycock03,
  author  = {John Aycock},
  title   = {A Brief History of Just-In-Time},
  journal = {ACM Computing Surveys},
  volume  = {35},
  number  = {2},
  pages   = {97--113},
  month   = jun,
  year    = {2003},
  annote  = {Gives an overview about the research in systems that
    generate code at run-time. Contains many references,
    including some to papers that previously escaped my
    literature searches.}
}

@misc{kuhn05unicode,
  author       = {Markus Kuhn},
  title        = {{UTF-8} and {Unicode} {FAQ} for {Unix/Linux}},
  year         = {2005},
  howpublished = {http://www.cl.cam.ac.uk/\~{}mgk25/\linebreak[0]unicode.html},
  OPTannote    = {}
}

@misc{pelc&knaggs01widechar,
  author       = {Stephen Pelc and Peter Knaggs},
  title        = {{ANS} {Forth} and Large Characters},
  year         = {2001},
  howpublished = {http://www.mpeforth.com/arena/\linebreak[0]i18n.widechar.v7.PDF},
  OPTannote    = {}
}

@article{rather85,
  author  = {Elizabeth D. Rather},
  title   = {Fifteen Programmers, 400 Computers, 36,000 Sensors
    and {FORTH}},
  journal = jfar,
  volume  = {3},
  number  = {2},
  pages   = {46--73},
  year    = {1985},
  annote  = {Describes how Forth was used in an automation system
    for the Riyadh airport.}
}

@article{ierusalimschy+05,
  author  = {Roberto Ierusalimschy and Luiz Henrique de
    Figueiredo and Waldemar Celes},
  title   = {The Implementation of {Lua} 5.0},
  journal = {Journal of Universal Computer Science},
  year    = {2005},
  volume  = {11},
  number  = {7},
  pages   = {1159--1176},
  url     = {http://www.tecgraf.puc-rio.br/~lhf/ftp/doc/jucs05.pdf},
  annote  = {This paper discusses a number of implementation
    issues in Lua 5.0 (and how they relate to the Lua
    design goals). It is well-written and does not
    require familiarity with Lua. The issues are: how
    values are represented; an optimization of (hash)
    tables such that arrays are used for a part of the
    table that is indexed with dense integer keys; how
    free variables in closures are represented; the
    implementation of threads and coroutines; and the
    virtual machine. The virtual machine was changed
    from a stack-based VM to a ``register''-based one in
    Lua 5.0 (actually it's an indexed-stack VM, or an
    IA64-style register stack); the paper gives some
    reasoning and presents advantages of this VM style
    (although it seems a little one-sided to me); it
    also provides timing results: the speedups from the
    VM change alone are 1.02--1.40 for a selection of
    the shootout benchmarks, and 2.28 for a
    microbenchmark. The paper does not say so
    explicitly, but one reason for the good performance
    of the new VM over the old stack-based VM is
    probably that both VMs use the slow switch-based
    dispatch method (for ANSI C compliance), so the
    reductions in executed VM instructions from the new
    VM have more effect than they would have with more
    efficient dispatch methods.}
}

@book{lindholm&yellin97,
  author    = {Tim Lindholm and Frank Yellin},
  title     = {The {Java} Virtual Machine Specification},
  publisher = {Addison-Wesley},
  year      = {1997},
  edition   = {First}
}

@book{lindholm&yellin99,
  author    = {Tim Lindholm and Frank Yellin},
  title     = {The {Java} Virtual Machine Specification},
  publisher = {Addison-Wesley},
  year      = {1999},
  edition   = {Second}
}

@article{thibault+00,
  author  = {Scott Thibault and Charles Consel and Julia
    L. Lawall and Renaud Marlet and Gilles Muller},
  title   = {Static and Dynamic Program Compilation by
    Interpreter Specialization},
  journal = {Higher-Order and Symbolic Computation},
  volume  = {13},
  number  = {3},
  pages   = {161--178},
  month   = sep,
  year    = {2000},
  annote  = {The authors use the Tempo specializer to convert
    several interpreters to compilers; compile-time
    specialization is used to convert interpreters to
    ahead-of-time compilers, run-time specialization for
    JIT compilers. They do this for three byte-code
    interpreters: Harissa (JVM), Ocaml, and the Berkeley
    Packet Filter. They have to transform the
    interpreter into a tail-recursive function so that
    the specializer can deal with conditional VM
    branches. Indirect VM branches require a translation
    table from VM-code addresses to specialized-function
    pointers; the paper does not tell what changes in
    the interpreter this required. These introductions
    of calls require that the VM registers are held in
    global variables, which causes the biggest slowdown
    of the resulting code compared to native-code
    compilers. The authors also applied Tempo to two
    interpreters described as \emph{structured code
    interpreters} for domain-specific languages (PLAN-P
    and GAL); these interpretive systems consume source
    code, and it is not clear what intermediate
    representation they use, if any; the paper also does
    not discuss what changes were applied to these two
    interpreters to facilitate specialization. For all
    interpreters, the paper presents encouraging
    performance data. The paper is nicely written and
    easy to read.}
}

@techreport{arnold+04,
  author      = {Matthew Arnold and Stephen J. Fink and David Grove
    and Michael Hind and Peter F. Sweeney},
  title       = {A Survey of Adaptive Optimization in Virtual
    Machines},
  type        = {Research Report},
  number      = {RC23143 (W0312-097)},
  institution = {IBM},
  year        = {2004},
  annote      = {The title says it all. It is certainly a good place
    to start if you want to get into the area, even
    though the writing style is not very exciting and
    the paper mentions only a few of the older works on
    the topic, and covers mostly newer work in
    its 165 references.}
}

@inproceedings{gagnon&hendren03,
  author    = {Etienne Gagnon and Laurie Hendren},
  title     = {Effective Inline-Threaded Interpretation of {Java}
    Bytecode Using Preparation Sequences},
  booktitle = {Compiler Construction (CC '03)},
  pages     = {170--184},
  year      = {2003},
  volume    = {2622},
  series    = {LNCS},
  publisher = {Springer},
  annote    = {This paper is about implementing dynamic
    superinstructions in the SableVM, and especially
    about dealing with lazy class initialization in that
    context. The starting point is a version of the JVM
    translated into threaded code. Race conditions in
    quickening are avoided by using an additional field
    in the instruction. The main contribution of the
    paper is the preparation sequences: out-of-line
    threaded-code sequences of simple and non-quick VM
    instructions, with a REPLACE instruction after the
    last non-quick instruction that rewrites the GOTO to
    the out-of-line sequence into the dynamic
    superinstruction (with everything
    quickened). SableVM superinstructions do not keep
    the instruction slots for all original instructions,
    so the preparation cannot be performed in-line (as
    is done in the Cacao interpreter). The paper
    contains empirical results for the overall effect of
    dynamic superinstructions (a speedup by up to a
    factor of 2.14 (for compress)), and for the effect of
    using preparation sequences compared to not putting
    quickenable instructions into dynamic
    superinstructions (a factor of up to 1.49
    (compress)).}
}

@book{graham04,
  author    = {Paul Graham},
  title     = {Hackers \& Painters},
  year      = {2004},
  publisher = {O'Reilly},
  annote    = {A collection of essays on various topics: US high
    schools, Startups, Economics, Programming
    Languages. While I don't buy everything he writes,
    the essays are interesting and easy to read, and
    contain interesting ideas.}
}

@inproceedings{berndl+05,
  author    = {Marc Berndl and Benjamin Vitale and Mathew Zaleski
    and Angela Demke Brown},
  title     = {Context Threading: A Flexible and Efficient Dispatch
    Technique for Virtual Machine Interpreters},
  booktitle = {Code Generation and Optimization (CGO)},
  pages     = {15--26},
  year      = {2005},
  annote    = {Dynamic superinstructions with replication decrease
    the number of indirect branch mispredictions, but
    can substantially increase the number of I-cache
    misses. Context threading is a variant of subroutine
    threading that addresses these problems: The basic
    technique is a (non-inlining) subroutine-threaded
    variant of dynamic superinstructions (which can be
    considered as the inlining variant of context
    threading): in addition to the subroutine calls,
    there is still a direct threaded code area used
    for inline arguments and for control flow. In order
    to get rid of the indirect jumps and their
    mispredictions on control flow, context threading
    translates virtual machine branches, calls and
    returns in a special way (in particular, calls and
    returns use the hardware return stack); they also
    explore inlining very short VM instruction
    implementations. The authors evaluate their approach
    on the Ocaml interpreter and on SableVM on a
    Pentium~4, PPC~7410, and PPC970; they achieve large
    reductions in branch mispredictions, and decent
    speedups. The paper does not discuss the amount of
    code necessary per CPU or the problems that using an
    SP-changing call instruction can cause (they mention
    it for the call optimization, but not for the
    subroutine threading).}
}

@inproceedings{hsieh+01,
  author    = {Wilson C. Hsieh and Dawson R. Engler and Godmar
    Back},
  title     = {Reverse-Engineering Instruction Encodings},
  booktitle = {USENIX Annual Technical Conference},
  OPTpages  = {},
  year      = {2001},
  url1      = {http://www.cs.utah.edu/~wilson/papers/derive-usenix01.pdf},
  url2      = {http://www.cs.utah.edu/~wilson/papers/derive-usenix01.html},
  annote    = {DERIVE is a tool that takes a description of valid
    assembler inputs, feeds a subset of the possible
    inputs through the assembler, looks at the output, and
    determines the instruction encodings, which can then
    be used for JIT compilers. The paper describes how
    DERIVE works and how it is used. DERIVE has been
    used to determine the encodings of six architectures
    (two only partially). The DERIVE code is available
    on-line.}
}

@article{wirth06,
  author  = {Niklaus Wirth},
  title   = {Good Ideas, Through the Looking Glass},
  journal = ieeecomputer,
  pages   = {28--39},
  month   = jan,
  year    = {2006},
  annote  = {The author looks at some past technologies in
    hardware, computer architecture, and programming
    languages, most of which he considers bad ideas in
    hindsight. In some cases (e.g., bubble memory), his
    opinion is probably universally held now, so his
    review seems kind of pointless; in other cases, his
    opinion is more controversial (e.g., for functional
    programming) or contrary to mainstream opinion (for
    virtual memory), so the paper may be good for
    inspiring discussions, if nothing else.}
}

@article{thompson68,
  author  = {Ken Thompson},
  title   = {Regular Expression Search Algorithm},
  journal = {Communications of the ACM},
  year    = {1968},
  volume  = {11},
  number  = {6},
  pages   = {419--422},
  month   = jun,
  annote  = {Regular expressions are first translated into
    postfix representation, then into machine code for
    the IBM 7094, where the flow diagram looks like a
    syntax diagram (or a funny variant of the NFA) of
    the regular expression (it is unclear what control
    flow actually happens for the $\oplus$ (alternative)
    node). The paper presents Algol code for the
    translation, and IBM 7094 assembly code for the
    resulting output. Supposedly this paper does lazy
    NFA->DFA conversion, but that is not discussed in
    the paper; it might be implicit in the 7094 assembly
    code for CNODE and
    NNODE. \code{<news:06-01-140@comp.compilers>}
    discusses this in more detail. In any case, this
    implementation does not seem to retain the DFA
    states.}
}

@inproceedings{nethercote&mycroft02,
  author    = {Nicholas Nethercote and Alan Mycroft},
  title     = {The Cache Behaviour of Large Lazy Functional
    Programs on Stock Hardware},
  booktitle = {Memory Systems Performance (MSP '02)},
  pages     = {44--55},
  year      = {2002},
  url       = {http://www.cs.mu.oz.au/~njn/pubs/cache-large-lazy2002.ps.gz},
  annote    = {This paper analyses the performance mainly of
    GHC-compiled Haskell programs on an Athlon using
    mainly performance counters. The paper looks at
    varying the GC nursery size (160KB seems to be quite
    good for most programs on that CPU (256KB exclusive
    L2 cache)); they also varied the initial heap size,
    and that has different effects on different
    programs, but the paper gives no explanation for
    that. They use a simple linear execution cost model
    ($\mathrm{cycles} = 0.8\mathrm{instructions} +
    12\mathrm{D-cache-misses} +
    206\mathrm{L2-cache-misses} +
    10\mathrm{branch-mispredictions}$), which proves to
    be surprisingly accurate for most benchmarks; this
    helps them explain the results: Two significant
    sources of stalls are L2 cache misses (for a good
    part write misses in the heap) and branch
    mispredictions (mainly from indirect branches, which
    are very frequent in GHC; apparently most calls and
    returns turn into indirect branches). They also
    perform a more detailed analysis of the reasons for
    cache misses based on simulation results. In
    addition, they also compare with some SML/NJ and C
    programs; it turns out that the GHC programs have a
    significantly higher CPI (typically 1.5--3) compared
    to the SML/NJ (1.2--1.6) and the C (1.1--1.5)
    programs.}
}

@article{heering+90,
  author    = {Jan Heering and Paul Klint and Jan Rekers},
  title     = {Incremental Generation of Parsers},
  journal   = {IEEE Transactions on Software Engineering},
  year      = {1990},
  volume    = {16},
  number    = {12},
  pages     = {1344--1351},
  month     = dec,
  OPTannote = {}
}

@inproceedings{hack+06,
  author    = {Sebastian Hack and Daniel Grund and Gerhard Goos},
  title     = {Register Allocation for Programs in {SSA}-Form},
  booktitle = {Compiler Construction {CC'06}},
  pages     = {247--262},
  year      = {2006},
  volume    = {3923},
  series    = {LNCS},
  publisher = {Springer},
  annote    = {The authors present a register allocation algorithm for
    programs in SSA form. As long as the program is in
    SSA form, its interference graph is chordal and
    optimally colourable in polynomial time. So the
    register allocation algorithm works by spilling,
    then colouring, then coalescing, and finally getting
    out of SSA form. The paper discusses at first why
    the colouring is not the problem in this approach,
    then discusses the other parts: spilling is done
    heuristically with something inspired by Belady's
    algorithm; SSA destruction is done by finally
    translating the phi functions into sequences of
    register swaps; finally, the paper talks a lot about
    coalescing (and colouring), such that many values
    stay in the same register during phi functions (so
    SSA destruction has less to do). For this part, the
    paper also presents empirical results, comparing the
    heuristic they developed with a version that does
    not do coalescing, and with a version that uses
    integer linear programming to find an optimal
    solution in most cases. Their heuristic is much
    closer to the optimum.}
}

@inproceedings{choi+99,
  author    = {Jong-Deok Choi and David Grove and Michael Hind and
    Vivek Sarkar},
  title     = {Efficient and Precise Modeling of Exceptions for the
    Analysis of {Java} Programs},
  booktitle = {Program Analysis for Software Tools and Engineering
    (PASTE'99)},
  OPTpages  = {},
  year      = {1999},
  OPTnote   = {},
  annote    = {Presents the Factored Control Flow Graph, a
    representation of the control flow that represents
    control flow edges resulting from potential
    exception-throwing instructions (PEIs) more
    efficiently (basically, it does not treat PEIs as
    branches for block formation, and creates only one
    set of exception edges per basic block instead of
    one per PEI). This helps in reducing the number of
    basic blocks and control flow edges significantly,
    which helps compile-time and memory consumption.}
}

@TechReport{rodeh06,
  author      = {Ohad Rodeh},
  title       = {B-Trees, Shadowing, and Clones},
  type        = {IBM Research Report},
  number      = {H-0245 (H0611-006)},
  institution = {IBM},
  month       = nov,
  year        = {2006},
  note        = {Presented at the 2007 Linux Storage and File Systems
                 Workshop},
  url         = {http://www.cs.huji.ac.il/~orodeh/papers/ibm-techreport/H-0245.pdf},
  slides-url  = {http://www.cs.huji.ac.il/~orodeh/papers/LinuxFS_Workshop.pdf},
  annote      = {The main focus of this work is making B-Trees work efficiently
                 in copy-on-write-style file systems (called ``shadowing'' in
                 this paper). The paper discusses the problems that popular
                 B-tree variants have in a COW environment (the whole tree
                 might be copied on a single insert or delete operation), and
                 suggests using $b+$-trees and a specific order of processing
                 the nodes in order to alleviate these problems. However,
                 B-Trees are designed to minimize writes in an update-in-place
                 system. Given that in a copy-on-write system we write the
                 whole path from the root to the changed block anyway, I
                 wonder if there is some design that utilizes the additional
                 writes in a useful way. In any case, what I found most
                 interesting in this paper is that it uses reference counts
                 for free-blocks management in the presence of clones. The
                 paper gives performance results for synthetic workloads.}
}

@InProceedings{pinheiro+07,
  author =       {Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz
                  Andr{\'e} Barroso},
  title =        {Failure Trends in a Large Disk Drive Population},
  booktitle =    {5th USENIX Conference on File and Storage
                  Technologies (FAST '07)},
  OPTpages =     {},
  year =         {2007},
  month =        feb,
  annote =       {Reports data on the correlation of various hard disk
                  properties (in particular SMART output) with their
                  failure probability, based on $>100{,}000$ drives
                  installed at Google. The paper mentions that drive
                  model, manufacturer and vintage play a role, but
                  does not give data on model and
                  manufacturer. Utilization and temperature did not
                  play a big role in failure probability (but the
                  drives were not run at really high temperatures; few
                  were above 45$^\circ$C). From the SMART data, scan
                  errors, reallocations, offline reallocations and
                  probational counts were significantly correlated
                  with failure probability, whereas seek errors,
                  calibration retries and spin retries had little
                  significance. But on more than half of the failed
                  drives, the four strong indicators mentioned above
                  had no counts.}
}

@InProceedings{dybvig06,
  author    = {R. Kent Dybvig},
  title     = {The Development of {Chez Scheme}},
  booktitle = {International Conference on Functional Programming (ICFP'06)},
  year      = {2006},
  pages     = {1--12},
  annote    = {Gives a history of Chez Scheme and its precursors, what
               technical innovations and features went into which version,
               and who contributed.}
}

@InProceedings{fisher&shivers06,
  author    = {David Fisher and Olin Shivers},
  title     = {Static Analysis for Syntax Objects},
  booktitle = {International Conference on Functional Programming (ICFP'06)},
  year      = {2006},
  pages     = {111--121},
  annote    = {Presents an s-expression-based system for providing macros to
               arbitrary languages, including support for static analysis
               (e.g., type inference).}
}

@InProceedings{saabas&uustalu07,
  author =       {Ando Saabas and Tarmo Uustalu},
  title =        {Type Systems for Optimizing Stack-Based Code},
  booktitle =    {ByteCode 2007 (ETAPS '07 workshop)},
  year =         {2007},
  annote =       {Performs a number of simple optimizations on JVM
                  code: dead-code elimination, load-pop elimination,
                  store-load optimization. The interesting parts are
                  that their optimizer is based on a type checker, and
                  that the optimizations are performed across basic
                  blocks (on unstructured code).}
}

@Article{benton+04,
  author =       {Nick Benton and Luca Cardelli and C{\'e}dric Fournet},
  title =        {Modern Concurrency Abstractions for {C\#}},
  journal =      toplas,
  year =         {2004},
  volume =       {26},
  number =       {5},
  pages =        {769--804},
  month =        sep,
  annote =       {Polyphonic C\# extends C\# with mechanisms for
                  communication and synchronization between threads:
                  asynchronous methods and chords. Asynchronous
                  methods can be called with a non-blocking call (they
                  are always void). A chord is a method body that has
                  several method heads (at most one synchronous); the
                  body is only executed once all of the heads are
                  called; several chords can contain the same head;
                  selection among several matching chords is
                  nondeterministic, as is the order among several
                  possible calls to a method. The theoretical
                  framework behind Polyphonic C\# is the join
                  calculus, but the paper does not delve into
                  that. Instead, it shows with program examples how
                  the new features can be used for various
                  parallel/distributed programming problems. It also
                  discusses how Polyphonic C\# is translated into
                  ordinary C\#, and gives some performance results,
                  mainly showing the cost of various operations in
                  microbenchmarks.}
}

@InProceedings{russel+06,
  author         = {Francis P. Russell and Michael R. Mellor and
                    Paul H. J. Kelly and Olav Beckmann},
  title          = {An Active Linear Algebra Library Using Delayed Evaluation
                    and Runtime Code Generation [Extended Abstract]},
  booktitle      = {Library-Centric Software Design (LCSD'06)},
  year           = {2006},
  pages          = {5--13},
  url            = {http://www.doc.ic.ac.uk/~phjk/Publications/DelayedEvaluationRTCG-LCSD06-ExtendedAbstract.pdf},
  proceedingsurl = {http://sms.cs.chalmers.se/bibliography/proceedings/2006-LCSD.pdf},
  annote         = {This work takes a description of a linear algebra
                    computation, applies some optimizations, generates C++ (?)
                    source code for it at run-time, then compiles, links and
                    executes that code. The optimizations applied before
                    generating source code are loop fusion, array contraction,
                    and liveness analysis (however, the latter is not clear to
                    me; apparently liveness is a statistical property and they
                    use some learning approach to determine it, and they fall
                    back on regenerating the value if they predicted wrongly
                    that it is dead). On the run-time code generation part
                    they perform a caching optimization, but only within a run
                    (caching across runs is mentioned as future work); they
                    give a rather detailed, yet confusing description of the
                    hashing and isomorphism checking they use in caching. The
                    paper presents performance results, showing the effects of
                    their optimizations (which do not help in most cases), and
                    also comparing with the Matrix Template Library; the
                    speedups depend on the benchmark and the specific CPU
                    (they used two Pentium~4 variants, and still got
                    significant performance differences).}
}

@InProceedings{zhang+07,
  author =       {David Zhang and Qiuyuan J. Li and Rodric Rabbah and
                  Saman Amarasinghe},
  title =        {A Lightweight Streaming Layer for Multi-Core
                  Execution},
  booktitle =    {2007 Workshop on Design, Architecture and Simulation
                  of Chip Multi-Processors},
  OPTpages =     {},
  year =         {2007},
  url =          {http://cag.lcs.mit.edu/commit/papers/07/zhang-dascmp07.pdf},
  OPTannote =    {Describes a low-level library for implementing
                  streaming computations on the Cell platform, and
                  provides some empirical data. The library breaks the
                  program into tasks that are then scheduled by a
                  scheduler. The tasks are not filters over the entire
                  data, but filters for a specific buffer size; the
                  same filter can run on different data on different
                  cores at the same time if there is data
                  parallelism. The restrictions of Cell lead to some
                  unusual design decisions: In particular, the filters
                  have to be loaded dynamically into the SPEs, and
                  they then have to be used for a long time in order
                  to amortize the loading cost. Probably because of
                  that, they choose a big buffer size of 1MB
                  (apparently some overheads are too large for buffer
                  sizes that fit into SPE's local memory), so they
                  have to put the data in main memory (supposedly this
                  is not a bottleneck for them, but then they don't
                  achieve top performance in other areas). There is an
                  interesting discussion and empirical comparison of
                  static and dynamic scheduling; it seems that except
                  under special circumstances, dynamic scheduling
                  works better. The paper discusses a lot of
                  implementation stuff, but I still get the feeling
                  that I am missing a lot. However, quite interesting
                  overall, even though I think that the design and
                  implementation is quite Cell-specific. There is also
                  an interesting Related Work section.}
}

@InProceedings{gummaraju&rosenblum05,
  author =       {Jayanth Gummaraju and Mendel Rosenblum},
  title =        {Stream Programming on General-Purpose Processors},
  booktitle =    {38th Annual International Symposium on
                  Microarchitecture (MICRO-38)},
  OPTpages =     {},
  year =         {2005},
  url =          {http://merrimac.stanford.edu/publications/micro38_streamingGPP.html},
  annote =       {The authors implement a stream programming system on
                  a Pentium~4 with Hyperthreading and empirically
                  compare some applications with conventionally
                  programmed (single-threaded) variants. The
                  applications they use are of the usual scientific
                  type that stream programming is used for. The
                  implementation of stream programming is remarkably
                  similar to that on distributed-memory machines like
                  the Cell: Data is processed by breaking the work
                  into tasks that work on a single buffer (called
                  strip-mining in the paper), using double buffering
                  to allow several tasks to work in parallel, and
                  having a scheduler that determines which task runs
                  on which hardware thread. The buffers (called stream
                  register files (SRFs)) are sized so they all fit
                  into the L2 cache (1MB), with 1--2 ways per set left
                  free for main memory accesses. The programming seems
                  to divide the work mainly between pure memory access
                  (in particular scatter/gather, one such thread per
                  memory stream), and pure computation (which can
                  perform many computations in one kernel); the
                  scheduler tries to schedule a memory task on one
                  thread and a compute task on the other, because that
                  results in better performance, whereas having two
                  memory tasks in the two threads gives worse
                  performance than executing the tasks
                  sequentially. The paper presents microbenchmarks
                  that show this, and presents application benchmarks
                  for streaming and conventional programs that show
                  that the streaming program can be faster (often a
                  factor 1.2) than the conventional program, but also
                  slower (more than a factor 2 slowdown for one
                  problem size of one benchmark). The explanation for
                  the speedup is that the memory system (e.g.,
                  hardware prefetching) works better if it has to work
                  on just one memory stream instead of several
                  intermixed ones as in the conventional code.}
}

@Book{oram&wilson07,
  editor    = {Andy Oram and Greg Wilson},
  title     = {Beautiful Code},
  publisher = {O'Reilly},
  year      = {2007},
  annote    = {This book contains 33 chapters, each by different authors, most
               discussing a program written by the author of the chapter. Some
               of the chapters are excellent (e.g., the one by Jon Bentley or
               the one on Mapreduce), some are at a much lower level (and one
               of those (by Tim Bray) makes claims and recommendations that
               are misleading). Most of the chapters, however, discuss a large
               piece of software, which is a hard task. These chapters often
               required quite a bit of advance knowledge in the problem
               solved, and in the method used for solving it; not
               surprisingly, I found that I skipped much of these chapters,
               because I found them uninteresting, and/or because I found them
               too hard to follow. Many of the chapters discuss performance
               improvements; while I have a soft spot for this topic, I found
               it overrepresented given the title of the book; but maybe this
               just means that many programmers find fast code beautiful.}
}

@Book{valloud08,
  author =       {Andr{\'e}s Valloud},
  title =        {Hashing in Smalltalk: Theory and Practice},
  publisher =    {self-published (www.lulu.com)},
  year =         {2008},
  annote =       {This book mainly discusses non-cryptographic hash
                  functions, with a bit of background on hashing and
                  hash function testing. The main body of the book is
                  taken up by a discussion of various hash functions
                  found in the literature (chapter 5) and found in
                  Smalltalk implementations (chapter 7). The book has
                  a number of shortcomings: It lacks a literature list
                  with proper references. It is written in a verbose,
                  repetitive, and not very systematic way: in
                  particular, many hash functions are discussed in
                  arbitrary order, instead of categorizing them at a
                  somewhat finer level, and discussing all of the hash
                  functions of one category together; many similar
                  hash functions are discussed at length. To make
                  things worse, there is no executive summary to be
                  found (not even in the conclusions section, which
                  just introduces quality categories, but fails to
                  categorize the hash functions discussed until
                  then). There is also no index (which would help
                  alleviate some of the disadvantages of the verbose
                  style). Each hash function is described with a nice
                  table, but I did not find a definition of the ``Hash
                  quality'' metric, and the collisions etc. when
                  working on the raw hash function seem to be of
                  little interest. Hash function values modulo $p$ are
                  only evaluated for prime $p$s, but power-of-two $p$s
                  would be more interesting (the author summarily
                  ignores that option because he claims that it leads
                  to too many collisions); it also does not evaluate
                  hash functions wrt all the criteria discussed
                  earlier in the book, e.g., it does not perform
                  avalanche tests. Hash function code is presented in
                  Smalltalk, which makes it more verbose for this
                  purpose than, say, C, and also harder to understand
                  for many readers. Despite all these shortcomings,
                  the book can still be useful for someone who wants
                  an overview and some test results for a bunch of hash
                  functions.}
}

@InProceedings{agakov+06,
  author =       {F. Agakov and E. Bonilla and J. Cavazos and
                  B. Franke and G. Fursin and M. F. P. O'Boyle and
                  J. Thompson and M. Toussaint and C. K. I. Williams},
  title =        {Using Machine Learning to Focus Iterative
                  Optimization},
  booktitle =    {Code Generation and Optimization (CGO'06)},
  pages =        {295--305},
  year =         {2006},
  url =          {http://homepages.inf.ed.ac.uk/jcavazos/cgo-2006.pdf},
  annote =       {Optimization can be improved by trying out different
                  orderings of optimization passes, and selecting the
                  one that gives the best result; however, this
                  approach results in long compile times. This paper
                  attacks this problem by doing an off-line training
                  to find out what works well for the particular
                  target architecture, and then needs much less
                  compile time to achieve results of similar
                  quality. The paper uses machine learning techniques
                  for this.}
}

@InProceedings{parker+92,
  author =       {D. Scott Parker and Eric Simon and Patrick
                  Valduriez},
  title =        {{SVP} -- a Model Capturing Sets, Streams, and
                  Parallelism},
  booktitle =    {Very Large Data Bases (VLDB'92)},
  pages =        {115--126},
  year =         {1992},
  annote =       {This paper presents some classical data and
                  computation structures (e.g., collections, and
                  divide-and-conquer) in a functional-programming way
                  and discusses how programs organized in that way can
                  be parallelized.}
}

@InProceedings{koes&goldstein08,
  author =       {David Ryan Koes and Seth Copen Goldstein},
  title =        {Near-Optimal Instruction Selection on {DAGs}},
  booktitle =    {Code Generation and Optimization (CGO '08)},
  pages =        {45--54},
  year =         {2008},
  url =          {http://www.cs.cmu.edu/~dkoes/research/dkoes_cgo08.pdf},
  annote =       {The instruction selection variant dealt with in the
                  paper corresponds to non-normalized tree grammars
                  with only one kind of non-terminal (this is called
                  \emph{tiling} in the paper). The paper provides a
                  proof that this restricted form of tree parsing is
                  still NP-complete for DAGs. The paper also presents
                  NOLTIS, a heuristic improvement over
                  straight-forward DAG parsing, and evaluates it;
                  unfortunately NOLTIS is restricted to the restricted
                  problem discussed above. The evaluation is the most
                  interesting part of the paper: They implement
                  various DAG instruction selection algorithms in LLVM
                  and compare NOLTIS to the original (greedy) LLVM
                  algorithm, plain DAG parsing with two different ways
                  of decomposing DAGs into trees, or without
                  decomposing them into trees, and an optimal
                  algorithm based on integer programming; the
                  comparison metrics are parsing cost, resulting code
                  size (after register allocation etc.), and
                  instruction selection speed. NOLTIS is very close to
                  optimal, but takes twice as long as the plain
                  DAG-parsing variants, and the greedy LLVM algorithm
                  is even faster. Among the plain DAG-parsing
                  variants, the one that does not decompose into trees
                  produces the best code, whereas the one that
                  decomposes completely performs worst, even worse
                  than the greedy LLVM algorithm.}
}

@InProceedings{nethercote&seward07,
  author =       {Nicholas Nethercote and Julian Seward},
  title =        {{Valgrind}: A Framework for Heavyweight Dynamic Binary
                  Instrumentation},
  booktitle =    {Programming Language Design and Implementation (PLDI'07)},
  pages =        {89--100},
  year =         {2007},
  annote =       {Gives a nice overview of the Valgrind framework for
                  binary instrumentation, explains the motivation,
                  requirements, implementation techniques, compares
                  with competing systems and presents data for
                  evaluating various aspects.  Valgrind works like a
                  binary translator, disassembling the original
                  program into an IR, then (unlike a translator) the
                  tool-specific part inserts code in the IR, and then
                  the IR is compiled back into machine code.  This
                  approach is less efficient than some others, but
                  allows more powerful tools (thus the
                  \emph{heavyweight} part of the title).}
}

@InCollection{stroustroup01,
  author =       {Bjarne Stroustrup},
  title =        {Exception Safety: Concepts and Techniques},
  booktitle =    {Advances in Exception Handling Techniques},
  pages =        {60--76},
  publisher =    {Springer LNCS~2022},
  year =         2001,
  url =          {http://www.research.att.com/~bs/except.pdf},
  OPTannote =    {}
}

@InProceedings{reddi+07,
  author =       {Vijay Janapa Reddi and Dan Connors and Robert Cohn
                  and Michael D. Smith},
  title =        {Persistent Code Caching: Exploiting Code Reuse
                  Across Executions and Applications},
  booktitle =    {Code Generation and Optimization (CGO '07)},
  pages =        {74--88},
  year =         {2007},
  url =          {http://rogue.colorado.edu/draco/papers/cgo-07-persistence.pdf},
  annote =       {Describes and empirically evaluates persistent code
                  caching as used in the Pin dynamic instrumentation
                  tool. The evaluation starts with same-input caching
                  across executions, and then proceeds to
                  different-input caching, and across-application
                  caching; unsurprisingly, these techniques help; more
                  interesting is that their effectiveness varies quite
                  a lot between applications, and for varying
                  reasons. The design apparently can only deal with
                  one cache per run, leading to some consequences and
                  evaluations that would probably not be done in a
                  system that can support more liberal mixing of
                  persistently cached code. One interesting
                  observation is that the meta-data is bigger than the
                  code.}
}

@InProceedings{bruening&kiriansky08,
  author =       {Derek Bruening and Vladimir Kiriansky},
  title =        {Process-Shared and Persistent Code Caches},
  booktitle =    {Virtual Execution Environments (VEE'08)},
  pages =        {61--70},
  year =         {2008},
  url =          {http://www.burningcutlery.com/derek/docs/procshared-VEE08.pdf},
  annote =       {An in-depth discussion of various design issues for
                  persistent and shared code caches, with particular
                  concern for security issues.  The context is binary
                  instrumentation based on DynamoRIO.  They produce
                  one cache file for each \emph{module} (shared object
                  file).  The code (and much of the data) in these
                  files is read-only and thus shared between processes
                  using the cache; the rest of the data is privately
                  writable.  For security, a user uses only caches
                  created by himself or by root (there is some
                  strangeness about the directories in Section 3.2).
                  Most of the design decisions are pretty
                  straightforward, but there are a lot of issues
                  discussed in the paper.  The evaluation shows the
                  memory savings due to sharing, and the startup speed
                  advantage due to persistence, both of which are
                  substantial.  The paper contains a long
                  \emph{Related Work} section and cites a lot of
                  papers.}
}

@Article{brauer+08,
  author =       {Johannes Brauer and Christoph Crasemann and Hartmut
                  Krasemann},
  title =        {Auf dem {Weg} zu idealen {Programmierwerkzeugen} --
                  {Bestandsaufnahme} und {Ausblick}},
  journal =      {Informatik-Spektrum},
  year =         {2008},
  pages =        {580--590},
  volume =       {31},
  number =       {6},
  annote =       {A critique of existing programming languages and
                  tools, and suggestions for future programming
                  languages.  The content appears more relevant than
                  most other papers of its kind \cite{wirth06}, but it
                  is also much less concrete (especially in the
                  critique part).  As for the suggestions, they
                  envision a dynamic language with some (unexplained)
                  provisions for supporting stable and safe end
                  products; they also envision a Smalltalk-style
                  really integrated development environment (instead
                  of the add-on style of Eclipse); and they envision
                  that the language/tool supports a large number of
                  DSLs in order to bridge the semantic gap.}
}

@InProceedings{prokopski&verbrugge08,
  author    = {Gregory B. Prokopski and Clark Verbrugge},
  title     = {Compiler-Guaranteed Safety in Code-Copying Virtual Machines},
  booktitle = {Compiler Construction (CC'08)},
  publisher = {Springer LNCS 4959},
  year      = {2008},
  pages     = {163--177},
  OPTannote = {}
}

@InProceedings{otto+09europar,
  author    = {Frank Otto and Victor Pankratius and Walter F. Tichy},
  title     = {{XJava}: Exploiting Parallelism with Object-Oriented Stream
               Programming},
  booktitle = {Euro-Par '09},
  publisher = {Springer LNCS~5704},
  year      = {2009},
  pages     = {875--886},
  OPTannote = {}
}
