@string(ai="Artificial Intelligence")
@string(byte = "BYTE")
@string(can="Computer Architecture News")
@string(complang="Computer Languages")
@string(ieeemicro="IEEE Micro")
@string{ieeecomputer="Computer"}
@string(jfar="Journal of Forth Application and Research")
@string(sigart="SIGART Newsletter")
@string(sigforth="SigForth Newsletter")
@string{sigplan="SIGPLAN Notices"}
@string{spe="Software---Practice and Experience"}
@string(cacm="Communications of the ACM")
@string(toplas="ACM Transactions on Programming Languages and Systems")
@string(ieeetc="IEEE Transactions on Computers")
@string(ibmjrd="IBM Journal of Research and Development")
@string(acmcs="ACM Computing Surveys")
@string(jacm="Journal of the ACM")
@string(iclp3 = "Logic Programming: Third International Conference")
@string(iclp6 = "Logic Programming: Sixth International Conference")
@string{pacmpl = "Proc. ACM Program. Lang."}
@incollection(peirera84, author="Pereira, L. M.", title="Logic Control with Logic", booktitle="Implementations of Prolog", editor="Campbell, J. A.", publisher="Ellis Horwood", year="1984", pages="177--193" )
@mastersthesis(pichler85, author="Christian Pichler", title="{Prolog-\"{U}bersetzer}", school="{Technische Universit\"{a}t Wien}", year="1985" )
@book(pangratz86, author="H. Pangratz", title="{Logische Schaltkreise}", publisher="{Technische Universit\"{a}t Wien, Institut f\"{u}r Datenverarbeitung}", year="1986" )
@book(vanhentenryck89, Author = "Van Hentenryck, Pascal", Title = "{Constraint Satisfaction in Logic Programming}", Series = "{Logic Programming Series}", Year = "1989", Publisher= "MIT Press", Address = "Cambridge, Massachusetts" )
@phdthesis(krall88, author="Andreas Krall", title="{Analyse und Implementierung von Prolog-Systemen}", school="{Technische Universit\"{a}t Wien}", year="1988" )
@inproceedings(jaffar&lassez87b, author="Joxan Jaffar and Jean-Louis Lassez", title="From Unification to Constraints", booktitle="Logic Programming '87", year="1987", editor="K. Furukawa and H. Tanaka and T. Fujisaki", pages="1--18", publisher="Springer LNCS 315" )
@inproceedings(cs-prolog, author="Toshio Kawamura and Hayato Ohwada and Fumio Mizoguchi", title="{CS-Prolog}: A Generalized Unification Based Constraint Solver", booktitle="Logic Programming '87", year="1987", editor="K. Furukawa and H. Tanaka and T. Fujisaki", pages="19--39", publisher="Springer LNCS 315" )
@inproceedings(dechter&pearl88, author="R. Dechter and J. Pearl", title="A Problem Simplification Approach that Generates Heuristics for Constraint-Satisfaction Problems", booktitle="Machine Intelligence 11", year="1988", editor="J. E. Hayes and D. Michie and J. Richards", publisher="Oxford University Press" )
@article(haralick&elliot80, author="Robert M. Haralick and Gordon L. Elliott", title="Increasing Tree Search Efficiency for Constraint Satisfaction Problems", journal=ai, year="1980", volume="14", pages="263--313" )
@inproceedings(jaffar&lassez87a, author="Joxan Jaffar and Jean-Louis Lassez", title="Constraint Logic Programming", booktitle="Fourteenth Annual {ACM} Symposium on Principles of Programming Languages (POPL)", year="1987", pages="111--119", address="M{\"u}nchen" )
@article(sussman&steele80, author="Gerald Jay Sussman and Guy Lewis {Steele Jr.}", title="CONSTRAINTS---A Language for Expressing Almost-Hierarchical Descriptions", journal=ai, year="1980", volume="14", pages="1--39" )
@article(rossi88, author="Francesca Rossi", title="Constraint Satisfaction Problems in Logic Programming", journal=sigart, year="1988", number="106", pages="24--28", month=oct )
@article(freeman-benson+90, author="Bjorn N.
Freeman-Benson and John Maloney and Alan Borning", title="An Incremental Constraint Solver", journal=cacm, year="1990", volume="33", number="1", pages="54--63", month=jan )
@inproceedings(carlsson87, author="Mats Carlsson", title="Freeze, Indexing and Other Implementation Issues in the {WAM}", crossref="iclp87", pages="40--58" )
@book(boizumault88, author="Patrice Boizumault", title="Prolog: L'implantation", publisher="Masson", year="1988", address="Paris" )
@book(vancaneghem86, author="Van Caneghem, Michel", title="L'anatomie de Prolog", publisher="InterEditions", year="1986", address="Paris" )
@inproceedings(boizumault86, author="Patrice Boizumault", title="A General Model to Implement {\tt dif} and {\tt freeze}", booktitle=iclp3, year="1986", pages="585--592", address="London", publisher="Springer LNCS 225" )
@techreport(gabriel+85, author="John Gabriel and Tim Lindholm and E. L. Lusk and R. A. Overbeek", title="A Tutorial on the {Warren Abstract Machine} for Computational Logic", institution="Argonne National Laboratory", year="1985", number="ANL-84-84" )
@inproceedings(touati&despain87, author="Herv\'{e} Touati and Alvin Despain", title="An Empirical Study of the Warren Abstract Machine", booktitle="1987 Symposium on Logic Programming", year="1987", pages="114--124", organization="IEEE" )
@book(neumann88, author="Gustaf Neumann", title="Metaprogrammierung und Prolog", publisher="Addison-Wesley", year="1988", series="Internationale Computer-Bibliothek", address="Bonn" )
@incollection(mellish82, author="C. S. Mellish", title="An Alternative to Structure Sharing in the Implementation of a {Prolog} Interpreter", booktitle="Logic Programming", publisher="Academic Press", year="1982", editor="K. L. Clark and S.-A. T{\"a}rnlund", pages="99--106", address="London" )
@inproceedings(bruynooghe86, author="Maurice Bruynooghe and Danny de Schreye and Bruno Krekels", title="Compiling Control", booktitle="1986 Symposium on Logic Programming", year="1986", pages="70--77", organization="IEEE" )
@article(cohen90, author="Jacques Cohen", title="Constraint Logic Programming Languages", journal=cacm, year="1990", volume="33", number="7", pages="52--68", month=jul )
@article(colmerauer90, author="Alain Colmerauer", title="An Introduction to {Prolog III}", journal=cacm, year="1990", volume="33", number="7", pages="69--90", month=jul )
@article(freuder78, author="Eugene C.
Freuder", title="Synthesizing Constraint Expressions", journal=cacm, year="1978", volume="21", number="11", pages="958--966", month=nov )
@inproceedings(beringer&porcher89, author="Henri Beringer and Franck Porcher", title="A Relevant Scheme for {Prolog} Extensions: {CLP(Conceptual Theory)}", booktitle=iclp6, year=1989, OPTeditor="Giorgio Levi and Maurizio Martelli", pages="131--148", address="Lisbon" )
@inproceedings(borning+89, author="Alan Borning and Michael Maher and Amy Martindale and Molly Wilson", title="Constraint Hierarchies and Logic Programming", booktitle=iclp6, year=1989, OPTeditor="Giorgio Levi and Maurizio Martelli", pages="149--164", address="Lisbon" )
@inproceedings(vanhentenryck89b, author="Van Hentenryck, Pascal", title="Parallel Constraint Satisfaction in Logic Programming: Preliminary Results of CHIP within PEPSys", booktitle=iclp6, year=1989, OPTeditor="Giorgio Levi and Maurizio Martelli", pages="165--180", address="Lisbon" )
@inproceedings(walinsky89, author="Clifford Walinsky", title="{CLP($\Sigma^*$)}: Constraint Logic Programming with Regular Sets", booktitle=iclp6, year=1989, OPTeditor="Giorgio Levi and Maurizio Martelli", pages="181--196", address="Lisbon" )
@book(lloyd84, author="John Wylie Lloyd", title="Foundations of Logic Programming", publisher="Springer-Verlag", year="1984", address="Berlin" )
@InProceedings(Dincbas+88, Author = "Dincbas, Mehmet and Van Hentenryck, Pascal and Simonis, Helmut and Aggoun, Abder and Graf, Thomas and Berthier, Fran\c{c}oise", Title ="{The Constraint Logic Programming Language CHIP}", BookTitle="{International Conference on Fifth Generation Computer Systems (FGCS)}", Address="Tokyo", Month=dec, Year="1988" )
@article(cuadrado85, author="Clara Y. Cuadrado and John L. Cuadrado", title="Prolog Goes to Work", journal=byte, year="1985", volume="10", number="8", pages="151--158", month=aug )
@article(kowalski85, author="Robert Kowalski", title="Logic Programming", journal=byte, year="1985", volume="10", number="8", pages="161--177", month=aug )
@article(darlington85, author="John Darlington", title="Program Transformation", journal=byte, year="1985", volume="10", number="8", pages="201--216", month=aug )
@manual(heintze+86, title="The {CLP} Programmer's Manual, Version 1.0", author="Nevin Heintze and Joxan Jaffar and Chean Shen Lim and Spiro Michaylov and Peter Stuckey and Roland Yap and Chut Ngeow Yee", organization="Department of Computer Science, Monash University", address="Australia", year="1986" )
@inproceedings(saraswat87, author="Vijay A.
Saraswat", title="{CP} as a General-Purpose Constraint-Language", booktitle=aaai87, year="1987", pages="53--58" )
@Proceedings{aaai87, key = "AAAI-87", booktitle = "{Sixth National Conference on Artificial Intelligence (AAAI)}", title = "{Sixth National Conference on Artificial Intelligence (AAAI)}", year = "1987", }
@inproceedings(mcallester90, author="David McAllester", title="Truth Maintenance", booktitle=aaai90, year="1990", pages="1109--1116" )
@Proceedings{aaai90, key = "AAAI-90", booktitle = "{Eighth National Conference on Artificial Intelligence (AAAI-90)}", title = "{Eighth National Conference on Artificial Intelligence (AAAI-90)}", year = "1990", }
@article(clocksin87, author="William Clocksin", title="A {Prolog} Primer", journal=byte, year="1987", volume="12", number="9", pages="147--158", month=aug )
@article(lassez87, author="Catherine Lassez", title="Constraint Logic Programming", journal=byte, year="1987", volume="12", number="9", pages="171--176", month=aug )
@article(colmerauer87, author="Alain Colmerauer", title="Opening the {Prolog~III} Universe", journal=byte, year="1987", volume="12", number="9", pages="177--182", month=aug )
@book(naish86, author="Lee Naish", title="Negation and Control in {Prolog}", publisher="Springer LNCS~238", year="1986" )
@book(giannesini+86, author="Francis Giannesini and Henry Kanoui and Robert Pasero and Michel van Caneghem", title="{Prolog}", publisher="Addison-Wesley", year="1986", series="International Computer Science Series" )
@inproceedings(seki&furukawa87, author="Hirohisa Seki and Koichi Furukawa", title="Notes on Transformation Techniques for Generate and Test Logic Programs", booktitle="1987 Symposium on Logic Programming", year="1987", pages="215--223", organization="IEEE" )
@incollection(clark+82, author="K. L. Clark and F. G. McCabe and S. Gregory", title="{IC-Prolog} Language Features", booktitle="Logic Programming", publisher="Academic Press", year="1982", editor="K. L. Clark and S.-A. T{\"a}rnlund", pages="253--266", address="London" )
@mastersthesis(knaus88, author="Bernhard J. Knaus", title="{Prolog Meta-Interpreter}", school="{Technische Universit\"{a}t Wien}", year="1988" )
@book(tick88, author="Evan Tick", title="Memory Performance of {Prolog} Architectures", publisher="Kluwer Academic Publishers", year="1988", address="Boston" )
@article(kowalski79, author="Robert Kowalski", title="Algorithm = Logic + Control", journal=cacm, year="1979", volume="22", number="7", pages="424--436", month=jul )
@mastersthesis(neumerkel89, author="Ulrich Neumerkel", title="{Speicherbereinigung f\"ur Prologsysteme}", school="{Technische Universit\"{a}t Wien}", year="1989" )
@mastersthesis(slany89, author="Wolfgang Slany", title="{Optimierung relationaler Anfragen am Beispiel der ARTHUR Implementierung}", school="{Technische Universit\"{a}t Wien}", year="1989" )
@phdthesis(holzbaur90, author="Christian Holzbaur", title="Implementation of Constraint Based Inference Mechanisms through Extended Unification", school="{Technische Universit\"{a}t Wien}", year="1990" )
@article(lauriere78, author="Jean-Louis Lauri\`{e}re", title="A Language and a Program for Stating and for Solving Combinatorial Problems", journal=ai, year="1978", volume="10", pages="29--127" )
@article(mackworth77, author="Alan K.
Mackworth", title="Consistency in Networks of Relations", journal=ai, year="1977", volume="8", pages="99--118" )
@inproceedings(gaschnig77, author="John Gaschnig", title="A General Backtrack Algorithm That Eliminates Most Redundant Tests", booktitle="5th International Joint Conference on Artificial Intelligence", year="1977", pages="457" )
@unpublished(carlsson90, author="Mats Carlsson", title="Re: Arrays in {Prolog}", note="Usenet News {``1990 Sep 25: 124816.12993@sics.se''}", year="1990" )
@misc(naish90, author="Lee Naish", year="1990", howpublished="E-mail correspondence" )
@article(nudel83, author="Bernard Nudel", title="Consistent Labeling Problems and their Algorithms: Expected Complexities and Theory-Based Heuristics", journal=ai, year="1983", volume="21", pages="135--178" )
@book(pearl84, author="Judea Pearl", title="Heuristics---Intelligent Search Strategies for Computer Problem Solving", publisher="Addison-Wesley", year="1984" )
@book(kaindl89, author="Hermann Kaindl", title="{Probleml\"osen durch heuristische Suche in der Artificial Intelligence}", publisher="Springer-Verlag", year="1989", address="Wien" )
@article(gardner75, author="Martin Gardner", title="Mathematical Games", journal="Scientific American", year="1975", volume="232", number="4", pages="126--133", month=apr )
@article(kubale&jackowski85, author="Marek Kubale and Bogus{\l}aw Jackowski", title="A General Implicit Enumeration Algorithm for Graph Coloring", journal=cacm, year="1985", volume="28", number="4", pages="412--418", month=apr )
@phdthesis(koza89, author="Christian Koza", title="{Garantiertes Zeitverhalten in verteilten Echtzeitsystemen}", school="{Technische Universit\"{a}t Wien}", year="1989" )
@inproceedings(krall&neumerkel90, author="Andreas Krall and Ulrich Neumerkel", title="The {Vienna Abstract Machine}", booktitle="Programming Language Implementation and Logic Programming (PLILP'90)", year="1990", OPTeditor="P. Deransart and J. Ma{\l}uszy\'nski", pages="121--136", publisher="Springer LNCS~456" )
@inproceedings(neumerkel90, author="Ulrich Neumerkel", title="Extensible Unification by Metastructures", booktitle="Meta-90", year="1990", address="Leuven" )
@techreport(warren83, author="David H. D. Warren", title="An Abstract {Prolog} Instruction Set", institution="SRI International", year="1983", number="309" )
@inproceedings(lim&stuckey90, author="Pierre Lim and Peter J. Stuckey", title="A Constraint Logic Programming Shell", booktitle="Programming Language Implementation and Logic Programming (PLILP'90)", year="1990", OPTeditor="P. Deransart and J. Ma{\l}uszy\'nski", pages="75--88", publisher="Springer LNCS~456" )
@inproceedings(naish86b, author="Lee Naish", title="Negation and Quantifiers in {NU-Prolog}", booktitle=iclp3, year="1986", pages="625--634", address="London", publisher="Springer LNCS 225" )
@article(dewdney86a, author="A. K. Dewdney", title="{Computer-Kurzweil}", journal="Spektrum der Wissenschaft", year="1986", pages="5--11", month=feb )
@article(dewdney86b, author="A. K. Dewdney", title="{Computer-Kurzweil}", journal="Spektrum der Wissenschaft", year="1986", pages="6--10", month=may )
@techreport{graf87, author="Thomas Graf", title="Extending Constraint Handling in Logic Programming to Rational Arithmetic", institution="ECRC", year="1987", type="Internal Report" }
@phdthesis(graf89, author="Thomas Graf", title="Raisonnement sur les contraintes en programmation en logique", school="Universit\'{e} de Nice -- Sophia Antipolis", year="1989" )
@article(buttner&simonis87, author="W. B{\"u}ttner and H.
Simonis", title="Embedding Boolean Expressions into Logic Programming", journal="Journal of Symbolic Computation", year="1987", volume="4", pages="191--205", month=oct )
@article(fisher81, author="Joseph A. Fisher", title="Trace Scheduling: A Technique for Global Microcode Compaction", journal=ieeetc, year="1981", volume="30", number="7", pages="478--490", month=jul, annote="Trace Scheduling takes one (often used) path of the program and schedules (compacts) it without respect to basic block boundaries. Correctness is ensured by inserting compensation code in adjacent basic blocks. The process is repeated until all basic blocks have been scheduled." )
@article(hu61, author="T. C. Hu", title="Parallel Sequencing and Assembly Line Problems", journal="Operations Research", year="1961", volume="9", number="6", pages="841--848" )
@book(kastens90, author="Uwe Kastens", title="{\"U}bersetzerbau", publisher="R. Oldenbourg Verlag", year="1990", address="M{\"u}nchen" )
@article(dincbas+90, author="Mehmet Dincbas and Helmut Simonis and Van Hentenryck, Pascal", title="Solving Large Combinatorial Problems in Logic Programming", journal="The Journal of Logic Programming", year="1990", volume="8", pages="75--93" )
@article(hennessy&gross83, author="John Hennessy and Thomas Gross", title="Postpass Code Optimization of Pipeline Constraints", journal=toplas, year="1983", volume="5", number="3", pages="422--448", month=jul, annote="Discusses instruction scheduling, shows its NP-completeness and presents a heuristic algorithm for instruction scheduling. The algorithm removes 47\% of the NOPs in their example programs (optimum = 54\%)." )
@inproceedings(davidson86, author="Jack W. Davidson", title="A Retargetable Instruction Reorganizer", booktitle="SIGPLAN '86 Symposium on Compiler Construction", year="1986", pages="234--241", annote="Enhances PO (and Davidson-Fraser code generators like the GNU C backend in general) to include evaluation order optimization and targeting. The enhancement can be used for instruction scheduling, too." )
@article(colwell+88, author="Robert P. Colwell and Robert P. Nix and John J. O'Donnel and David B. Papworth and Paul K. Rodman", title="A {VLIW} Architecture for a Trace Scheduling Compiler", journal=ieeetc, year="1988", volume="37", number="8", pages="318--328", month=aug )
@inproceedings(vanhentenryck&dincbas87, author="Van Hentenryck, Pascal and Mehmet Dincbas", title="Forward Checking in Logic Programming", crossref="iclp87", pages="229--256" )
@Proceedings{iclp87, key = "ICLP-4", title = "Fourth International Conference on Logic Programming (ICLP-4)", booktitle = "Fourth International Conference on Logic Programming (ICLP-4)", year = "1987", publisher = "MIT Press" }
@manual(motorola90, title="MC88100 RISC Microprocessor User's Manual", organization="Motorola, Inc.", edition="second", year="1990" )
@Article{hatcher91, author = "Philip J. Hatcher", title = "The Equational Specification of Efficient Compiler Code Generation", journal = complang, year = "1991", volume = "16", number = "1", pages = "81--95", annote = "A system (UCG) for the equational specification of code generators is presented. The code generators work by rewriting (explicit) trees with associated actions, similar to Graham-Glanville code generators. The tree rewriting system enables pre-code generation transformations, too. The code generators are about twice as fast as those produced by the Davidson-Fraser and Graham-Glanville approaches and the code generator of pcc."
}
@Article{dhamdhere88, author = "Dhananjay Madhav Dhamdhere", title = "Register Assignment Using Code Placement Techniques", journal = complang, year = "1988", volume = "13", number = "2", pages = "75--93", annote = "Describes an algorithm that moves loads/stores to good places. The resulting program is guaranteed not to be slower than the original program. Cites many papers that are not found in other register allocation literature." }
@InProceedings{wall86, author = "David W. Wall", title = "Global Register Allocation at Link Time", booktitle = "SIGPLAN '86 Symposium on Compiler Construction", year = "1986", pages = "264--275", OPTorganization = "ACM SIGPLAN", annote = "Uses annotations from the compiler to do fast interprocedural register allocation at link time. Speedups of 10--20\% are obtained. Most (52--99\%) of the removable memory references are removed. The improvement over intraprocedural coloring allocation is 1--8\%." }
@Article{tjaden&flynn70, author = "Garold S. Tjaden and Michael J. Flynn", title = "Detection and Parallel Execution of Independent Instructions", journal = ieeetc, year = "1970", volume = "19", number = "10", pages = "889--895", month = oct, annote = "A hardware method to achieve parallelism on a SISD machine. Simultaneously decodes multiple instructions and executes them with multiple execution units, but keeps dependent instructions in the correct order. Simulations show that 1.86 IBM~7090 instructions can be executed per cycle without compiler assistance." }
@InProceedings{gibbons&muchnick86, author = "Phillip B. Gibbons and Steve S. Muchnick", title = "Efficient Instruction Scheduling for a Pipelined Architecture", booktitle = "SIGPLAN '86 Symposium on Compiler Construction", year = "1986", pages = "11--16", OPTorganization = "ACM SIGPLAN", annote = "A heuristic algorithm for instruction scheduling with $O(n^2)$ worst-case and linear observed complexity." }
@Article{zima86, author = "Hans P. Zima", title = "A Constraint Language and its Interpreter", journal = complang, year = "1986", volume = "11", number = "2", pages = "65--83", annote = "The language looks pascaloid, supports built-in and user-defined constraints on integer and real variables, and contains structuring constructs like arrays and iterators. The interpreter analyses the program and tries to find bindings for all variables. If necessary, the user is requested to input values for selected variables." }
@Article{chow&hennessy90, author = "Fred C. Chow and John L. Hennessy", title = "The Priority-Based Coloring Approach to Register Allocation", journal = toplas, year = "1990", volume = "12", number = "4", pages = "501--536", month = oct, annote = "Assigns registers using a priority function. If no registers are available, live ranges are split and move or spill code is inserted. Includes a lot of data." }
@Article{davidson&fraser84, author = "Jack W. Davidson and Christopher W. Fraser", title = "Code Selection through Object Code Optimization", journal = toplas, year = "1984", volume = "6", number = "4", pages = "505--526", month = oct, annote = "The code generator emits naive but executable code as register transfers (a kind of universal assembly), which is transformed into efficient code by a peephole optimizer."
}
@InProceedings{lam88, author = "Monica Lam", title = "Software Pipelining: An Effective Scheduling Technique for {VLIW} Machines", booktitle = "SIGPLAN '88 Conference on Programming Language Design and Implementation", year = "1988", pages = "318--328", OPTorganization = "ACM SIGPLAN", annote = "Software pipelining enables the parallel execution of multiple loop iterations by translating loops into a prolog, a steady state, and an epilog, in which the pipeline is filled, stays full, and is emptied, respectively. The paper also presents hierarchical reduction, which enables the application of software pipelining to loops containing conditional statements." }
@InProceedings{fraser&wendt88, author = "Christopher W. Fraser and Alan L. Wendt", title = "Automatic Generation of Fast Optimizing Code Generators", booktitle = "SIGPLAN '88 Conference on Programming Language Design and Implementation", year = "1988", pages = "79--84", OPTorganization = "ACM SIGPLAN", annote = "Nonprocedural specifications for code generator and peephole optimizer are used to compile a testbed of programs. The record of the optimizations during these compilations is used to generate a fast, hard-coded integrated code generator and peephole optimizer." }
@InProceedings{chow88, author = "Fred C. Chow", title = "Minimizing Register Usage Penalty at Procedure Calls", booktitle = "SIGPLAN '88 Conference on Programming Language Design and Implementation", year = "1988", pages = "85--94", OPTorganization = "ACM SIGPLAN", annote = "Describes interprocedural register allocation at compile time based on processing the procedures in a one-pass depth-first traversal. Resorts to the conventional scheme when there is insufficient information (e.g., separate compilation). Also describes shrink-wrapping of saves and restores to regions of activity, to avoid having saves and restores in paths where they are not needed. Interprocedural register allocation has speedups of -2.6--12\%, shrink-wrapping -0.2--2\%, with respect to optimized code with intraprocedural register allocation." }
@InProceedings{benitez&davidson88, author = "Manuel E. Benitez and Jack W. Davidson", title = "A Portable Global Optimizer and Linker", booktitle = "SIGPLAN '88 Conference on Programming Language Design and Implementation", year = "1988", pages = "329--338", annote = "A nice treatment of the Davidson-Fraser approach combined with global optimizations. The only thing new seems to be the linker, which does some simple interprocedural optimizations (call/entry streamlining). They buy a speedup of 3--20\% on a VAX and up to 5\% on a Sun~3." }
@InProceedings{larus&hilfinger86, author = "James R. Larus and Paul N. Hilfinger", title = "Register Allocation in the SPUR Lisp Compiler", booktitle = "SIGPLAN '86 Symposium on Compiler Construction", year = "1986", pages = "255--263", annote = "Chow's priority-based coloring in the context of register windows." }
@InProceedings{fraser&wendt86, author = "Christopher W. Fraser and Alan L. Wendt", title = "Integrating Code Generation and Optimization", booktitle = "SIGPLAN '86 Symposium on Compiler Construction", year = "1986", pages = "242--248", annote = "An improvement on PO: at compile-compile time PO runs on a testbed and generates a fixed set of rules that are used at compile time by a fast, rule-directed optimizer that avoids much string scanning." }
@InProceedings{bernstein+89, author = "David Bernstein and Dina Q. Goldin and Martin C. Golumbic and Yishay Mansour and Itai Nahshon and Ron Y.
Pinter", title = "Spill Code Minimization Techniques for Optimizing Compilers", crossref = "sigplan89", pages = "258--263", annote = "Improves Chaitin's algorithm by using three heuristics for choosing the next variable to spill and picking the best one with respect to a cost function." }
@InProceedings{gupta+89, author = "Rajiv Gupta and Mary Lou Soffa and Tim Steele", title = "Register Allocation Via Clique Separators", crossref = "sigplan89", pages = "264--274", annote = "Graphs are easier to color by decomposing them using clique separators (cliques that are the only connection between the parts) and coloring the parts. For straight-line code, the live ranges at any point of time are clique separators. Branches are handled as in trace scheduling. The technique is claimed to be more efficient at coloring time as well as to produce better allocations." }
@InProceedings{briggs+89, author = "Preston Briggs and Keith D. Cooper and Ken Kennedy and Linda Torczon", title = "Coloring Heuristics for Register Allocation", crossref = "sigplan89", pages = "275--284", annote = "Improves Chaitin's allocator by using a better coloring algorithm and by deferring the spill decision. The dynamic improvement over Chaitin's algorithm is 1\% for large floating-point programs. It increases as the number of registers decreases (15\% dynamic improvement for integer code with 8 registers). The allocation time stays the same." }
@InProceedings{emmelmann+89, author = {Helmut Emmelmann and Friedrich-Wilhelm Schr\"oer and Rudolf Landwehr}, title = {{BEG} -- a Generator for Efficient Back Ends}, crossref = "sigplan89", pages = "227--237" }
@Proceedings{sigplan89, key = "SIGPLAN~'89", booktitle = "SIGPLAN~'89 Conference on Programming Language Design and Implementation", title = "SIGPLAN~'89 Conference on Programming Language Design and Implementation", year = "1989", }
@InProceedings{johnson&miller86, author = "Mark Scott Johnson and Terence C. Miller", title = "Effectiveness of a Machine-Level, Global Optimizer", booktitle = "SIGPLAN '86 Symposium on Compiler Construction", year = "1986", pages = "99--107", annote = "Gives an overview of the back end of the HP Precision Architecture compilers." }
@Article{ganapathi89, author = "Mahadevan Ganapathi", title = "Prolog Based Retargetable Code Generation", journal = complang, year = "1989", volume = "14", number = "3", pages = "193--204", annote = "Code generation is quite conventional (tree rewriting). Also discusses some optimizations, but I could not follow them." }
@Article{dhamdhere90, author = "Dhananjay Madhav Dhamdhere", title = "A Usually Linear Algorithm for Register Assignment Using Edge Placement of Load and Store Instructions", journal = complang, year = "1990", volume = "15", number = "2", pages = "83--94", annote = "Improves on his earlier load/store-movement algorithm by placing the loads and stores on edges of the flow graph. It produces better code than the older algorithm and runs faster." }
@Article{golumbic&rainish90, author = "Martin Charles Golumbic and Vladimir Rainish", title = "Instruction Scheduling Beyond Basic Blocks", journal = ibmjrd, year = "1990", volume = "34", number = "1", pages = "93--97", month = jan, annote = "Discusses some rearrangements that reduce pipeline stalls around branches for the RS/6000. Very processor-specific and not very ingenious."
}
@Article{warren90, author = "Warren, Jr., Henry S.", title = "Instruction Scheduling for the {IBM RISC System/6000} Processor", journal = ibmjrd, year = "1990", volume = "34", number = "1", pages = "85--92", month = jan, annote = "Gives a description of the delays that can happen in the RS/6000. It uses the usual scheduling algorithm. One of the secondary heuristics (liveness weight) should reduce register pressure. The scheduling is done before and after register allocation." }
@InProceedings{bradlee+91asplos, author = "David G. Bradlee and Susan J. Eggers and Robert R. Henry", title = "Integrating Register Allocation and Instruction Scheduling for {RISCs}", crossref = "asplos91", pages = "122--131", annote = "Compares three strategies: postpass; a variant of Goodman and Hsu's Integrated Prepass Scheduling (IPS), where instruction scheduling is performed before register allocation but takes register allocation into account; and their own RASE, where the register allocator considers the effects of its choices on the instruction scheduler. Floating-point and integer programs are measured on the 88100, R2000 and i860. IPS and RASE are much better than postpass; RASE is better than IPS in a few cases, but is not cost-effective." }
@InProceedings{lee+91, author = "Roland L. Lee and Alex Y. Kwok and Fay\'e A. Briggs", title = "The Floating-Point Performance of a Superscalar {SPARC} Processor", crossref = "asplos91", pages = "28--37", annote = "Compares loop unrolling, software pipelining and the combination for floating-point programs on SPARC processors with 1, 2, or 4 instructions/cycle." }
@InProceedings{goodman&hsu88, author = "James R. Goodman and Wei-Chung Hsu", title = "Code Scheduling and Register Allocation in Large Basic Blocks", booktitle = "International Conference on Supercomputing", year = "1988", pages = "442--452", annote = "After an overview of the phase ordering problems in instruction scheduling and register allocation, two algorithms are introduced: Integrated Prepass Scheduling keeps track of the number of registers left and switches accordingly between scheduling to minimize pipeline stalls and scheduling to minimize register usage. A variation of this algorithm also spills registers in certain circumstances. DAG-Driven Register Allocation is to be used with a postpass scheduler and tries to allocate the registers without generating new dependencies. If this cannot be achieved, the register is chosen in a way that minimizes the path length of the additional paths. The two algorithms (combined with a register allocator and an instruction scheduler, respectively) perform better than the usual prepass, postpass or two-pass approaches." }
@TechReport{waltz72, author = "D. Waltz", title = "Generating Semantic Descriptions from Drawings of Scenes with Shadows", institution = "MIT", year = "1972", number = "AI271" }
@Book{koopman89, author = "Koopman, Jr., Philip J.", title = "Stack Computers", publisher = "Ellis Horwood Limited", year = "1989", url = "http://www.cs.cmu.edu/~koopman/stack_computers/index.html", annote = "Presents the architecture of recent stack processors coming from the Forth community. Compares stack machines with conventional machines. The results: for real-time applications stack machines are better, because their worst-case performance is higher. Contrary to common opinion, stack machines have faster context switches than conventional machines." }
@InProceedings{proebsting&fischer91, author = "Todd A. Proebsting and Charles N.
Fischer", title = "Linear-time, Optimal Code Scheduling for Delayed-Load Architectures", crossref = "sigplan91", pages = "256--267", annote = "This algorithm generates optimal schedules with optimal register usage for a very limited problem: binary expression trees on a load/store architecture with a one-cycle delay on loads, where the leaf nodes are loads, i.e., no register variables, no constants, no unary operators, no common subexpression elimination, and scheduling is restricted to one statement at a time. It generates optimal register spill code. Also, the (sub-optimal) application of the algorithm to DAGs and longer delays is treated." }
@InProceedings{jain91, author = "Suneel Jain", title = "{Circular Scheduling}: A New Technique to Perform Software Pipelining", crossref = "sigplan91", pages = "219--228", annote = "Performs software pipelining by selecting an instruction without predecessors from the dependency DAG of the loop body, moving it to the end of the loop, and rescheduling. This step is repeated until the schedule becomes a lot worse. Then the best schedule is used. For compensation, the unmoved instructions are used as prolog and the moved ones as epilog. A register renaming algorithm to reduce dependencies introduced by register allocation is introduced, too. The improvement on the Livermore loops is between -5\% and 53\% on the MIPS R6010." }
@Article{bernhardsson91, author = "Bo Bernhardsson", title = "Explicit Solutions to the N-Queens Problem for all N", journal = sigart, year = "1991", volume = "2", number = "2", pages = "7", }
@Article{sosic&gu91, author = "Rok Sosic and Jun Gu", title = "3,000,000 Queens in Less Than One Minute", journal = sigart, year = "1991", volume = "2", number = "2", pages = "22--24", annote = "This algorithm sets almost all queens in a conflict-free manner, but the last few queens are allowed to have diagonal conflicts. These conflicts are then eliminated by selecting two queens (one systematically and one randomly) and swapping their columns if this improves the state of affairs. The algorithm runs in linear time." }
@Article{gu91, author = "Jun Gu", title = "On a General Framework for Large-Scale Constraint-Based Optimization", journal = sigart, year = "1991", volume = "2", number = "2", pages = "8", annote = "An answer to \cite{bernhardsson91}." }
@InProceedings{morris91, author = "W. G. Morris", title = "{CCG}: A Prototype Coagulating Code Generator", crossref = "sigplan91", pages = "45--58", annote = "Basic blocks are treated (code generation, register allocation, \ldots) in decreasing order of (measured or estimated) execution frequency. Later blocks have to adapt to decisions made in earlier blocks. Therefore the most frequent blocks can be compiled very well. Procedure calls are treated like other control flow constructs, resulting in automatic interprocedural optimization. The average speedup on gcc is 25\% on a Sun~3, but I had the impression that they used the same benchmarks for development and the measurements. The speedups are mainly from smaller procedure call overhead, so they are twice as valuable. There was apparently no separate compilation." }
@InProceedings{wall91pldi, author = "David W.
Wall", title = "Predicting Program Behavior Using Real or Estimated Profiles", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "59--70", address = "Toronto", journal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, annote = "Checks estimated profiles and real profiles measured on different input data for their accuracy and relevance. Real profiles prove best; estimated profiles are better than random profiles in most cases. The best estimated profiles are based on loop nesting level combined with static call counts." }
@InProceedings{mcfarling91, author = {Scott McFarling}, title = {Procedure Merging with Instruction Caches}, crossref = {sigplan91}, pages = {71--79}, annote = {Inlining is steered by a heuristic that considers the I-cache size; in particular, it tries to fit loops into the cache. The empirical part compares this new heuristic with heuristics based on size and heuristics based on the ratio of size and dynamic execution counts.} }
@InProceedings{jaffar+91, author = "Joxan Jaffar and Spiro Michaylov and Roland H. C. Yap", title = "A Methodology for Managing Hard Constraints in CLP Systems", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "306--316", address = "Toronto", journal = sigplan, volume = "26", number = "6", month = jun, annote = "Describes the delay mechanism of CLP($\cal R$). Delayed constraints are pushed on a special stack; every time one of their responsible variables is changed, their new state is pushed. Responsible variables are connected to the constraints through a complicated access structure. Backtracking is done by popping the constraints and rebuilding the access structures (no trailing and untrailing)." }
@InProceedings{pugh91, author = "William Pugh", title = "Advice to Authors of Extended Abstracts", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "353--356", OPTaddress = "Toronto", journal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, annote = "How to get your paper accepted." }
@InProceedings{bradlee+91pldi, author = "David G. Bradlee and Robert R. Henry and Susan J. Eggers", title = "The {Marion} System for Retargetable Instruction Scheduling", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "229--240", address = "Toronto", OPTjournal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, annote = "A back end generator for RISCs, consisting of simple instruction selection and several strategies for the combination of register allocation and instruction scheduling (postpass, IPS, RASE). The machine description describes the resources needed by an instruction and also contains means to explicitly describe advanced pipelines (i860). Machine descriptions were developed for the 88100, the R2000 and the i860. The quality of the generated code is between the MIPS compiler's -O1 and -O2 levels." }
@Proceedings{sigplan91, key = "SIGPLAN~'91", booktitle = "SIGPLAN~'91 Conference on Programming Language Design and Implementation", title = "SIGPLAN~'91 Conference on Programming Language Design and Implementation", year = "1991", OPTaddress = "Toronto", OPTjournal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, }
@InProceedings{chang+91, author = "Pohua P. Chang and Scott A. Mahlke and William Y. Chen and Nancy J. Warter and {Wen-mei} W.
Hwu", title = "{IMPACT}: An Architectural Framework for Multiple-Instruction-Issue Processors", crossref = "isca91", pages = "266--275", annote = "Describes an optimizing compiler that uses trace scheduling to increase instruction-level parallelism. They call their approach general percolation. Empirical results are presented: their software approach does nearly as well as speculative execution, a hardware method, if trap generation by division by zero or illegal memory references is disabled. If such instructions can produce traps, they cannot be moved as much (restricted percolation), resulting in noticeably lower performance." }
@InProceedings{butler+91, author = "Michael Butler and Tse-Yu Yeh and Yale Patt and Mitch Alsup and Hunter Scales and Michael Shebanow", title = "Single Instruction Stream Parallelism Is Greater Than Two", crossref = "isca91", pages = "276--286", annote = "A study on how much instruction-level parallelism can be achieved on different hardware models. Code produced by a conventional 88k compiler is used, but better hardware featuring dynamic scheduling and speculative execution is simulated. If data dependencies are the only source of serialization, the SPEC suite has a parallelism of 17--1165. On a realistic machine with eight functional units, integer parallelism is 2.4--3.4 and floating point 1.9--5.8, without considering cache misses. The paper identifies the bottlenecks in the simulated architectures." }
@InProceedings{melvin&patt91, author = "Stephen Melvin and Yale Patt", title = "Exploiting Fine-Grain Parallelism Through a Combination of Hardware and Software Techniques", crossref = "isca91", pages = "287--296", annote = "The parallelism available through dynamic scheduling and speculative execution can be enhanced with basic block enlargement, a technique similar to trace scheduling combined with loop unrolling." }
@InProceedings{quammen&miller91, author = "Donna J. Quammen and P. Richard Miller", title = "Flexible Register Management for Sequential Programs", crossref = "isca91", pages = "320--329", annote = "Describes a more flexible register window mechanism (threaded or t-windows), where windows can be allocated freely. This can be used to simulate a single register set, conventional register windows, or a register stack. The main advantage of this scheme is in concurrent systems, where a window stack can be used for every process." }
@InProceedings{bradlee+91isca, author = "David G. Bradlee and Susan J. Eggers and Robert R. Henry", title = "The Effect on RISC Performance of Register Set Size and Structure Versus Code Generation Strategy", crossref = "isca91", pages = "330--339", annote = "Machines with a single register set and slow floating-point units (like the 88100) are compared to machines with separate register sets and a fast FP unit (e.g. the R3000). Machines with fast FP should use the FP unit for integer multiply even if they have a split register set." }
@Proceedings{isca91, key = "ISCA-18", booktitle = "The $18^{th}$ Annual International Symposium on Computer Architecture (ISCA)", title = "The $18^{th}$ Annual International Symposium on Computer Architecture (ISCA)", year = "1991", address = "Toronto", journal = can, OPTvolume = "19", OPTnumber = "3", OPTmonth = may, }
@Book{hennessy&patterson90, author = "John L. Hennessy and David A. Patterson", title = "Computer Architecture.
A Quantitative Approach", publisher = "Morgan Kaufmann Publishers", year = "1990", }
@InProceedings{callahan&koblenz91, author = "David Callahan and Brian Koblenz", title = "Register Allocation via Hierarchical Graph Coloring", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "192--203", address = "Toronto", journal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, annote = "The register allocator builds a tile tree representing the control hierarchy (loops, ifs, etc.) of the procedure. Then the tiles are colored bottom-up, using Chaitin's algorithm for every tile. Finally, spill code is inserted on rarely used tile boundaries. It sounds very good, but I miss an empirical comparison with other approaches." }
@InProceedings{bernstein&rodeh91, author = "David Bernstein and Michael Rodeh", title = "Global Instruction Scheduling for Superscalar Machines", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "241--255", address = "Toronto", journal = sigplan, OPTvolume = "26", OPTnumber = "6", OPTmonth = jun, annote = "Describes an instruction scheduler that moves instructions between basic blocks. It can even move instructions to places where they are not always useful (speculative execution at compile time). Using this scheduler increases compile time by 12\%--17\% and decreases run time by 0\%--7\% when compared with the usual RS/6000 compiler." }
@InProceedings{freeman&pfening91, author = "Tim Freeman and Frank Pfenning", title = "Refinement Types for ML", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "268--277", address = "Toronto", journal = sigplan, volume = "26", number = "6", month = jun, annote = "Adds (explicitly defined) subtypes to ML's type system, which can help static type checking." }
@InProceedings{cartwright&fagan91, author = "Robert Cartwright and Mike Fagan", title = "Soft Typing", booktitle = "SIGPLAN '91 Conference on Programming Language Design and Implementation", year = "1991", pages = "278--292", address = "Toronto", journal = sigplan, volume = "26", number = "6", month = jun, annote = "Expands ML's type system to gain expressive power. Not all programs using the extended type system can be type-checked at compile time, so run-time checks are inserted. The goals in designing the type system were to have implicit typing and to be able to check the majority of programs at compile time, so that the user takes warnings about inserted run-time checks seriously." }
@Article{grabienski91, author = "Peter Grabienski", title = "A Stack-Oriented Multiprocessing System", journal = can, year = "1991", volume = "19", number = "1", pages = "120--127", month = mar, annote = "Combines a Forth processor with links similar to a transputer's. Unlike transputers, the links are byte-wide (10 MB/s) and use hardware routing." }
@InProceedings{cytron+89, author = "Ron Cytron and Jeanne Ferrante and Barry K. Rosen and Mark N. Wegman and F. Kenneth Zadeck", title = "An Efficient Method of Computing Static Single Assignment Form", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "25--35", address = "Austin, Texas", annote = "Static Single Assignment (SSA) form is equivalent to a data flow graph within basic blocks, with $\phi$-functions merging data flow edges where control flow merges. This form is useful for optimization (they give many references).
In this article, an algorithm for computing the SSA form is presented. It is shown that the algorithm is linear in the size of the program for programs using if- and while-structures. Empirical data on irreducible programs suggests that this is usually also true for these programs." }
@InProceedings{sagiv+89, author = "S. Sagiv and O. Edelstein and N. Francez and M. Rodeh", title = "Resolving Circularity in Attribute Grammars with Applications to Data Flow Analysis", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "36--48", address = "Austin, Texas", annote = "How to transform circular attribute grammars, where a unique fixed point can be computed, into noncircular AGs. Describes the application of such AGs to data flow analysis." }
@InProceedings{wadler&blott89, author = "Philip Wadler and Stephen Blott", title = "How To Make {\em ad-hoc} Polymorphism Less {\em ad hoc}", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "60--76", address = "Austin, Texas", annote = "Describes type classes, a generalization of ML's eqtype variables. A type class is a set of types which have some user-defined operations, e.g. Num = (+), (*), Negate; instances: Int, Float. It is extensible to arbitrary types; the operations must be defined for every type. For convenience there are also subclasses. The translation into an SML-like language is described (it works with a method dictionary). Type classes result in a lot of writing." }
@InProceedings{kanellakis&mitchell89, author = "Paris C. Kanellakis and John C. Mitchell", title = "Polymorphic unification and {ML} typing", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "105--115", address = "Austin, Texas", annote = "Proves that polymorphic unification, i.e., ML typing, is PSPACE-hard. The complexity comes from {\tt let}. However, practical programs with many {\tt let}s can be typed without problems." }
@InProceedings{yelick&zachary89, author = "Katherine A. Yelick and Joseph L. Zachary", title = "Moded Type Systems for Logic Programming", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "116--124", address = "Austin, Texas", annote = "Describes a mode system for equational logic programming languages that narrows the gap between declarative and procedural semantics. It is shown that two predicate implementations with the same declarative meaning will be operationally equivalent." }
@InProceedings{hickey89, author = "Timothy J. Hickey", title = "{CLP*} and Constraint Abstraction", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "125--133", address = "Austin, Texas" }
@InProceedings{cardelli+89, author = "Luca Cardelli and Jim Donahue and Mick Jordan and Bill Kalsow and Greg Nelson", title = "The {Modula-3} Type System", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "202--212", address = "Austin, Texas", annote = "The subtype relation is central to the system. The unusual parts are traced references (garbage collection), exceptions as part of the procedure type, and the global use of structural equivalence. Pointers and arrays can be assigned to subtypes; the correctness of the operation is checked at run time.
Objects are records; methods are procedure variables with self as the first operand; the actual method (procedure) is determined at object creation time (new)." }
@InProceedings{abadi+89, author = "Mart\'in Abadi and Luca Cardelli and Benjamin Pierce and Gordon Plotkin", title = "Dynamic Typing in a Statically-Typed Language", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "213--227", address = "Austin, Texas", annote = "Introduces a type {\tt Dynamic} and describes its semantics. The handling of the type is somewhat tiring." }
@InProceedings{parker89, author = "D. Stott Parker", title = "Partial Order Programming", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "260--266", address = "Austin, Texas", annote = "Describes a framework for several kinds of problems, among them constraint satisfaction problems, and some properties of the framework." }
@InProceedings{kelsey&hudak89, author = "Richard Kelsey and Paul Hudak", title = "Realistic Compilation by Program Transformation", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "281--292", address = "Austin, Texas", annote = "A back end using lambda calculus with an implicit store as intermediate language. The phases are: making the program linear (no nested expressions), adding explicit continuations, simplifying, adding environments, and identifier renaming/register allocation. Not completely denotational. The code produced is about as good as that of the Apollo Pascal compiler." }
@InProceedings{appel&jim89, author = "Andrew W. Appel and Trevor Jim", title = "Continuation-Passing, Closure-Passing Style", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "293--302", address = "Austin, Texas", annote = "An ML compiler based on continuation-passing style. After the translation into CPS and optimization, closures are made explicit and registers are allocated. No stack is used for the closures. Instead, the compiler relies on garbage collection." }
@InProceedings{pugh&teitelbaum89, author = "William Pugh and Tim Teitelbaum", title = "Incremental Computation via Function Caching", booktitle = "Sixteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1989", pages = "315--328", address = "Austin, Texas" }
@InCollection{mccarthy81, author = {John McCarthy}, title = {History of {LISP}}, crossref = {hopl81}, pages = {173--197}, url = {http://jmc.stanford.edu/articles/lisp/lisp.pdf}, OPTannote = {} }
@Book{hopl81, title = "History of Programming Languages", booktitle = "History of Programming Languages", publisher = "Academic Press", year = "1981", editor = "Richard L. Wexelblat", }
@Article{kuga+91, author = "Morihiro Kuga and Kazuaki Murakami and Shinji Tomita", title = "{DSNS} (dynamically-hazard-resolved, statically-code-scheduled, nonuniform superscalar): {Yet} Another Superscalar Processor Architecture", journal = can, year = "1991", volume = "19", number = "4", pages = "14--29", month = jun, annote = "Features several sorts of load instructions differing in the execution order they require (strongly, weakly, and unordered). The specific load instructions are selected by the compiler using aliasing information."
}
@Article{ponder91, author = "Carl Ponder", title = "Performance Variation Across Benchmarks", journal = can, year = "1991", volume = "19", number = "4", pages = "30--36", month = jun, annote = "Compares how much various benchmarks differ when run on two machines. The Livermore loops, Dhrystone and Whetstone are better than their reputation; application benchmarks can be deceptive, too." }
@Article{conte&hwu91, author = "Thomas M. Conte and {Wen-mei} Hwu", title = "A Brief Survey of Benchmark Usage in the Architecture Community", journal = can, year = "1991", volume = "19", number = "4", pages = "37--44", month = jun, annote = "Takes the papers of the ISCAs '84--'90, classifies the benchmarks used, and analyses the usage patterns." }
@Article{fraser&hanson91a, author = "Christopher W. Fraser and David R. Hanson", title = "A Code Generation Interface for {ANSI C}", journal = spe, year = "1991", volume = "21", number = "9", pages = "963--988", month = sep, annote = "Describes the call interface and intermediate code (DAGs) for a compiler that passes information between phases through memory. Very practice-oriented (listings etc.)." }
@Article{fraser&hanson91b, author = "Christopher W. Fraser and David R. Hanson", title = "A Retargetable Compiler for {ANSI C}", journal = sigplan, year = "1991", volume = "26", number = "10", pages = "29--43", month = oct, annote = "Describes a fast C compiler and how they made it fast." }
@Article{landskov+80, author = "David Landskov and Scott Davidson and Bruce Shriver and Patrick W. Mallett", title = "Local Microcode Compaction Techniques", journal = acmcs, year = "1980", volume = "12", number = "3", pages = "261--294", month = sep }
@Article{rodriguez90, author = "Brad Rodriguez", title = "A {BNF} Parser in {Forth}", journal = sigforth, year = "1990", volume = "2", number = "2", pages = "13--15", month = dec, url = "http://www.forth.org/bnfparse.html", annote = "Describes top-down parsing with backtracking in Forth; includes a listing." }
@InProceedings{kessler+91, author = "C. W. Ke{\ss}ler and W. J. Paul and T. Rauber", title = "A Randomized Heuristic Approach to Register Allocation", crossref = "plilp91", pages = "195--206" }
@Proceedings{plilp91, key = "PLILP'91", title = "Programming Language Implementation and Logic Programming (PLILP)", booktitle = "Programming Language Implementation and Logic Programming (PLILP)", year = "1991", OPTeditor = "Jan Ma{\l}uszy\'nski and Martin Wirsing", publisher = "Springer LNCS~528", address = "Passau" }
@InProceedings{haberler&ertl89, author = "Michael Haberler and Martin Ertl", title = "Offloading A Mainframe or Teaching A Spreadsheet How To Access Big Databases", booktitle = "EUUG Autumn '89 Conference", year = "1989", pages = "45--49", address = "Wien" }
@Article{ertl91, author = "M. A. Ertl", title = "{Kurzfassung der Diplomarbeit ``Coroutining und Constraints in der Logik-Programmierung''}", journal = "{\"{O}GAI Journal}", year = "1991", volume = "9", number = "4", pages = "12--20", month = mar }
@TechReport{fraser&hanson90, author = "Christopher W. Fraser and David R. Hanson", title = "A Code Generation Interface for ANSI C", institution = "AT&T Bell Laboratories", year = "1990", type = "Research Report", number = "CS-TR-270-90", note = "Revision of September 1991", annote = "The interface of the {\tt lcc} front end to the back end consists of a few functions that are called and data structures for expression dags and symbols. The interface is explained using a simple VAX back end as an example."
}
@TechReport{rodriguez89, author = "Brad Rodriguez", title = "Moving {Forth}: Principles of Metacompilation", institution = "T-Recursive Technology", year = "1989", address = "55 McCaul St. \#14, Toronto, Ontario M5T 2W7 Canada", url = "http://www.zetetics.com/bj/papers/", annote = "Forth metacompilation is the art of creating a new Forth system (possibly for a different machine) on the current one. This paper explains the concepts quite well." }
@InProceedings{pelegri-llopart&graham88, author = "Eduardo Pelegr\'\i-Llopart and Susan L. Graham", title = "Optimal Code Generation for Expression Trees: An Application of the {BURS} Theory", booktitle = "Fifteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1988", pages = "294--308", annote = "Describes bottom-up rewrite systems. The first half is very dry and theoretical (definitions and propositions). The second half gives empirical results for applying BURS to code selection and makes comparisons with other approaches." }
@Article{aho+77, author = "A. V. Aho and S. C. Johnson and J. D. Ullman", title = "Code Generation for Expressions with Common Subexpressions", journal = jacm, year = "1977", volume = "24", number = "1", pages = "146--160", month = jan, annote = "Shows that optimal code generation for dags on two-address machines is NP-complete. Contrary to popular belief, this paper does not show that optimal instruction selection in dags is NP-complete in general. Also discussed are linear-time code selection algorithms and their deviations from optimality, as well as optimal algorithms and their complexity." }
@InProceedings{chase87, author = "David R. Chase", title = "An Improvement To Bottom-up Tree Pattern Matching", booktitle = "Fourteenth Annual {ACM} Symposium on Principles of Programming Languages", year = "1987", pages = "168--177", }
@Manual{fraser+91, title = "{\sc Burg} --- Fast Optimal Instruction Selection and Tree Parsing", author = "Christopher W. Fraser and Robert R. Henry and Todd A. Proebsting", year = "1991", url = "ftp://kaese.cs.wisc.edu/pub/burg.shar.Z", annote = "A BURS tree pattern matcher generator." }
@Article{fraser+92, author = "Christopher W. Fraser and Robert R. Henry and Todd A. Proebsting", title = "{\sc Burg} --- Fast Optimal Instruction Selection and Tree Parsing", journal = sigplan, year = "1992", volume = "27", number = "4", pages = "68--76", month = apr, url = "ftp://kaese.cs.wisc.edu/pub/burg.shar.Z" }
@string{loplas = "ACM Letters on Programming Languages and Systems"}
@Article{fraser+93, author = "Christopher W. Fraser and David R. Hanson and Todd A. Proebsting", title = "Engineering a simple, efficient code generator generator", journal = loplas, year = "1993", OPTvolume = "", OPTnumber = "", OPTpages = "", OPTmonth = "", url = "ftp://ftp.cs.princeton.edu/pub/iburg.tar.Z" }
@Manual{bradley87, title = "68000 Unix Forth-83", author = "Mitch Bradley", year = "1987", note = "Available via ftp, newer versions commercial" }
@Manual{patel90, title = "TILE Release 2.1", author = "Mikael Patel", year = "1990", note = "Available via ftp from any GNU archive site" }
@InProceedings{wall91asplos, author = "David W. Wall", title = "Limits of Instruction-Level Parallelism", crossref = "asplos91", pages = "176--188", annote = "Compares a 64-way superscalar machine with perfect branch and jump prediction, alias analysis, and register renaming against more realistic alternatives." }
@InProceedings{bhandarkar&clark91, author = "Dileep Bhandarkar and Douglas W.
Clark", title = "Performance from Architecture: Comparing a RISC and a CISC with Similar Hardware Organization", crossref = "asplos91", pages = "310--319" } @InProceedings{Appel&Li91, author = "Andrew W. Appel and Kai Li", title = "Virtual Memory Primitives for User Programs", crossref = "asplos91", pages = "96--107" } @Proceedings{asplos91, key = "ASPLOS-IV", title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-IV)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-IV)", year = "1991", } @InProceedings{thornton64, author = "J. E. Thornton", title = "Parallel Operation in {Control Data~6600}", booktitle = "AFIPS Fall Joint Computer Conference", year = "1964", pages = "33-40" } @Article{tomasulo67, author = "R. M. Tomasulo", title = "An Efficient Algorithm for Exploiting Multiple Arithmetic Units", journal = ibmjrd, year = "1967", volume = "11", number = "1", pages = "25--33" } @Book{thornton70, author = "J. E. Thornton", title = "Design of a Computer", publisher = "Scott, Foresman", year = "1970", address = "Glenview, Ill." } @Book{aho+86, author = "Alfred V. Aho and Ravi Sethi and Jeffrey D. Ullman", title = "Compilers. Principles, Techniques, and Tools", publisher = "Addison-Wesley", year = "1986" } @Book{ellis85, author = "John R. Ellis", title = "Bulldog: A Compiler for {VLIW} Architectures", publisher = "MIT Press", year = "1985", annote = "The compiler uses traditional optimizations, trace scheduling, memory-reference disambiguation, and memory-bank disambiguation. The main problem of the code generator is to place operations on the right node (functional unit®ister bank) to reduce movements between nodes and functional unit contention. Measurements are made on numeric programs. On most programs a good speedup is achieved, but sometimes the speedup is low or non-existent." } @InCollection{gross&ward91, author = "T. Gross and M. Ward", title = "The Suppression of Compensation Code", crossref = "nicolau+91", year = "1991", pages = "260--273", annote = "Presents an algorithm to suppress the redundant compensation code that na{\"i}ve trace scheduling produces when it moves code across an if-statement. This algorithm is more general than the solution proposed in \cite{ellis85}. Measurements show a reduction of 0\%--95\% in compensation code." } @InCollection{aiken&nicolau91, author = "A. Aiken and A. Nicolau", title = "A Realistic Resource-Constrained Software Pipelining Algorithm", crossref = "nicolau+91", year = "1991", pages = "274--290", } @InCollection{larus91, author = "J. R. Larus", title = "Parallelism in Numeric and Symbolic Programs", crossref = "nicolau+91", year = "1991", pages = "331--349", annote = "Symbolic (i.e. nonnumeric programs like {\tt gcc}) profit only a little from loop parallelization techniques developed for numeric programs." } @InCollection{pingali+91, author = "K. Pingali and M. Beck and R. Johnson and M. Moudgill and P. Stodghill", title = "Dependence Flow Graphs: an Algebraic Approach to Program Transformation", crossref = "nicolau+91", year = "1991", pages = "445--467", annote = "Introduces Dependence Flow Graphs, a new intermediate representation designed to facilitate optimization and demonstrates its advantages using constant propagation as an example. Dependence Flow graphs are similar to data flow graphs, but include memory manipulation operators and represents loops explicitely." 
} @Book{nicolau+91, title = "Advances in Languages and Compilers for Parallel Processing", booktitle = "Advances in Languages and Compilers for Parallel Processing", publisher = "Pitman", year = "1991", editor = "Alexandru Nicolau and David Gelernter and Thomas Gross and David Padua", series = "Research Monographs in Parallel and Distributed Programming", address = "London", } @Article{dongarra&jinds79, author = "J. J. Dongarra and A. R. Hinds", title = "Unrolling Loops in {Fortran}", journal = spe, year = "1979", volume = "9", number = "3", pages = "219--226", month = mar } @InProceedings{linn83, author = "Joseph L. Linn", title = "{SRDAG} Compaction --- A Generalization of Trace Scheduling to Increase the Use of Global Context Information", booktitle = "MICRO-16, The $16^{\it th}$ Annual Microprogramming Workshop", year = "1983", pages = "11--22", annote = "Generalizes trace scheduling to work on singly rooted DAGs. The root basic block is compacted, then a new SRDAG is selected and its root is compacted. Unfortunately the paper contains only theoretical results on the performance of the algorithm." } @InProceedings{smotherman+91, author = "Mark Smotherman and Sanjay Krishnamurthy and P. S. Aravind and David Hunnicutt", title = "Efficient {DAG} Construction and Heuristic Calculation for Instruction Scheduling", booktitle = "MICRO-24, $24^{\it th}$ Annual Intl. Symp. on Microarchitecture", year = "1991", pages = "93--102", annote = "Gives an overview of the heuristics used in papers on instruction scheduling and classifies how they can be computed (in a forward or backward pass or during scheduling). Compares algorithms for building the dependence DAG ($n^2$ and table-building) and shows the interactions between DAG building and heuristic computation." } @Article{freiburghouse74, author = "R. A. Freiburghouse", title = "Register Allocation Via Usage Counts", journal = cacm, year = "1974", volume = "17", number = "11", pages = "638--642", month = nov } @Article{adam+74, author = "Thomas L. Adam and K. M. Chandy and J. R. Dickson", title = "A Comparison of List Schedules for Parallel Processing Systems", journal = cacm, year = "1974", volume = "17", number = "12", pages = "685--690", month = dec } @InProceedings{smith+90, author = "Michael D. Smith and Monica S. Lam and Mark A. Horowitz", title = "Boosting Beyond Static Scheduling in a Superscalar Processor", crossref = "isca90", pages = "344--354", annote = "Proposes the use of static scheduling and hardware backup for speculative execution. Every instruction has a tag that indicates whether it is executed speculatively. Looks as if the shadow registers have to be addressed explicitly. The branch instruction then commits or squashes the results of speculative execution. They call this technique boosting. They compare this scheme with one-branch speculation to a dynamically scheduled machine, assuming no alias detection in both cases. Boosting is slightly better, although their scheduler was quite restricted." } @Proceedings{isca90, key = "ISCA-17", booktitle = "The $17^{th}$ Annual International Symposium on Computer Architecture (ISCA)", title = "The $17^{th}$ Annual International Symposium on Computer Architecture (ISCA)", year = "1990", OPTaddress = "Seattle", OPTmonth = jun, } @InProceedings{smith+89, author = "Michael D. Smith and Mike Johnson and Mark A.
Horowitz", title = "Limits on Multiple Instruction Issue", crossref = "asplos89", pages = "290--302", annote = "Evaluates the instruction-level parallelism available in various superscalar designs with hardware scheduling and speculative execution. They emphasize that instruction fetch is the worst bottleneck. This result is questionable, as they employ only branch prediction, but no branch target buffering (although they call their prediction branch target buffer)." } @InProceedings{jouppi&wall89, author = "Norman P. Jouppi and David W. Wall", title = "Available Instruction-Level Parallelism for Superscalar and Superpipelined Machines", crossref = "asplos89", pages = "272--282", annote = "Presents a nice framework for understanding instruction-level parallelism. Current Risc Processors already exploit a certain amount of parallelism, e.g. when loading and branching. They measure instruction-level parallelism, but obviously only for basic block scheduling. Therefore they get a discouraging result. They also measure the effect of optimizations on parallelism. They seem to be quite neutral." } @Proceedings{asplos89, key = "ASPLOS-III", title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-III)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-III)", year = "1989", } @Article{popescu+91, author = "Val Popescu and Merle Schulz and John Spracklen and Gary Gibson and Bruce Lightner and David Isaman", title = "The {Metaflow} Architecture", journal = ieeemicro, year = "1991", pages = "10--13, 63--73", month = jun, annote = "Introduces hardware scheduling etc. Then a microarchitecture to achieve it is described. It is quite similar to \cite{sohi&vajapeyam87}, but has different register renaming, that lends itself better to speculative execution. At last they describe the realisation in the Lightning processor, a 4-scalar SPARC." } @InProceedings{sohi&vajapeyam87, author = "Gurindar S. Sohi and Sriram Vajapeyam", title = "Instruction Issue Logic for High-Performance, Interruptable Pipelined Processors", crossref = "isca87", pages = "27--34", note = "Newer version: \cite{sohi90}", annote = "Discuss a microarchitecture for superscalar execution. They explain it by transforming Tomasulo's solution into their's. Their solution to precise interrupts is to retire the instructions in order, i.e. write the results back in order. This also provides for speculative execution. Simulations show that a CRAY-1 with these mechanisms, but no other additional hardware is 51\% faster (window size 20). Leaving out the most expensive part drops the speedup to 31\%." } @Article{sohi90, author = "Gurindar S. Sohi", title = "Instruction Issue Logic for High-Performance, Interruptable, Multiple Functional Unit, Pipelined Processors", journal = ieeetc, year = "1990", volume = "39", number = "3", pages = "349--359", month = mar } @InProceedings{su&ding85, author = "Bogong Su and Shiyuan Ding", title = "Some Experiments in Global Microcode Compaction", crossref = "micro85", pages = "175--180", annote = "Describes a few global compaction techniques and compares them by compacting a few example programs." } @InProceedings{patt+85a, author = "Yale N. Patt and {Wen-mei} Hwu and Michael Shebanow", title = "{HPS}, a New Microarchitecture: Rationale and Introduction", crossref = "micro85", pages = "103--108", annote = "CISC instructions are decoded into RISC instructions, which are executed in parallel using dynamic scheduling etc. 
This microengine is presented as a restricted data flow machine." } @InProceedings{patt+85b, author = "Yale N. Patt and Stephen W. Melvin and {Wen-mei} Hwu and Michael C. Shebanow", title = "Critical Issues Regarding {HPS}, a High Performance Microarchitecture", crossref = "micro85", pages = "109--116", annote = "Discusses in depth some of the issues in dynamic scheduling hardware." } @Proceedings{micro85, key = "MICRO-18", booktitle = "The $18^{th}$ Annual Workshop on Microprogramming (MICRO-18)", title = "The $18^{th}$ Annual Workshop on Microprogramming (MICRO-18)", year = "1985", } @InProceedings{hwu&patt87isca, author = "{Wen-mei} Hwu and Yale N. Patt", title = "Checkpoint Repair for Out-of-order Execution Machines", crossref = "isca87", pages = "18--26", note = "Newer version: \cite{hwu&patt87ieeetc}", annote = "Describes design issues in checkpoint mechanisms for precise interrupts and speculative execution. Their design uses backup register files and difference techniques for main memory. Instructions can be retired out of order, avoiding full window conditions." } @Article{hwu&patt87ieeetc, author = "{Wen-mei} Hwu and Yale N. Patt", title = "Checkpoint Repair for High-Performance Out-of-order Execution Machines", journal = ieeetc, year = "1987", volume = "36", number = "12", pages = "1496--1514", month = dec } @Article{anathana&long90, author = "Kasi Anathana and Fred Long", title = "Code Compaction For Parallel Architectures", journal = spe, year = "1990", volume = "20", number = "6", pages = "537--554", month = jun, annote = "Describes methods for exploiting instruction-level parallelism based on code movement between basic blocks and renaming. They describe their data structures and list their algorithms. Unfortunately the system seems to work only on toy problems." } @InProceedings{cooper85, author = "K. Cooper", title = "Analyzing Aliases of Reference Formal Parameters", booktitle = "Conference Record of the Twelfth ACM Symposium on Principles of Programming Languages", year = "1985" } @InProceedings{coutant86, author = "D. S. Coutant", title = "Retargetable High-Level Alias Analysis", booktitle = "Conference Record of the Thirteenth ACM Symposium on Principles of Programming Languages", year = "1986" } @Book{stallings90, title = "Reduced Instruction Set Computers", publisher = "IEEE Computer Society Press", year = "1990", editor = "William Stallings", edition = "second", annote = "A collection of articles on RISCs, most of them pretty old (before '87) and/or low-level." } @Article{chaitin+81, author = "Gregory J. Chaitin and Marc A. Auslander and Ashok K. Chandra and John Cocke and Martin E. Hopkins and Peter W. Markstein", title = "Register Allocation via Coloring", journal = complang, year = "1981", volume = "6", number = "1", pages = "45--57", note = "Reprinted in \cite{stallings90}", annote = "The seminal paper on coloring register allocation. The spill code generation differs greatly from Chaitin's later paper and reminds me of hierarchical graph coloring." } @InProceedings{shieh&papachristou89, author = "Jong-Jiann Shieh and Christos A. Papachristou", title = "On Reordering Instruction Streams for Pipelined Computers", crossref = "micro22", pages = "199--206", annote = "Another paper on basic block instruction scheduling." } @InProceedings{schwiegelshohn+89, author = "U. Schwiegelshohn and F. Gasperoni and K.
Ebcio\u{g}lu", title = "On Optimal Loop Parallelization", crossref = "micro22", pages = "141--147", annote = "Proves that there are loops that cannot be parallelized optimally even with unlimited resources, because they could demand more resources in every iteration, exceeding any bound. Optimal parallelization means, that the program executes in the time given by its critical path length." } @InProceedings{nakatani&ebcioglu89, title = "{``Combining''} as a Compilation Technique for {VLIW} Architectures", author = "Toshio Nakatani and Kemal Ebcio\u{g}lu", crossref = "micro22", pages = "43--55", annote = "They reduce the path length by combining operations on immediate values. Combining is used in conjunction with percolation scheduling, software pipelining and loop unrolling on an interesting VLIW architecture. In this architecture every instruction consists of a decision tree with the operations on the edges and condition code tests on the nodes. In this setting combining results in a speedup of up to 18\%." } @Proceedings{micro22, booktitle = "$22^{\it nd}$ Annual International Workshop on Microprogramming and Microarchitecture (MICRO-22)", year = "1989" } @Book{kuck78, author = "David J. Kuck", title = "The Structure of Computers and Computations", publisher = "John Wiley \& Sons", year = "1978", volume = "1", annote = "A textbook on computer hardware and architecture. Contains some interesting things that are now reappearing (e.g. a chapter on tree-height reduction)." } @Article{cocke88, author = "John Cocke", title = "The Search for Performance in Scientific Processors", journal = cacm, year = "1988", volume = "31", number = "3", pages = "250--253", month = mar, note = "Turing Award Lecture", annote = "Contains among other things a description of the Advanced Computer System (ACS) project at IBM 1964--1968 (superscalar (1 branch, 3 integer, 2 fpadd, 1 fpmul, 2 memory), branch prediction, compiler support); History of the 801" } @MastersThesis{stuerzlinger89, author = "Wolfgang St{\"u}rzlinger", title = "{C-Compiler f\"ur den VIP-Prozessor}", school="{Technische Universit\"{a}t Wien}", year = "1989", } @InProceedings{beaty+90, author = "Steven Beaty and Gearold Johnson and Darrell Whitley", title = "Motivation and Framework for Using Genetic Algorithms for Microcode Compaction", crossref = "micro23", note = "Reprinted in: SIGmicro Newsletter, January 1991", pages = "117--124", annote = "Gives a good introduction in genetic algorithms and applies them to (local) microcode compaction: The chromosomes are the priority list of the instructions." } @InProceedings{nakatani&ebcioglu90, author = "Toshio Nakatani and Kemal Ebcio\u{g}lu", title = "Using a Lookahead Window in a Compaction-Based Parallelizing Compiler", crossref = "micro23", note = "Reprinted in: SIGmicro Newsletter, January 1991", pages = "57--68" } @MastersThesis{ambrosch93, author = "Wolfgang Ambrosch", title = "{Analyse und Vergleich von Registerallokationsalgorithmen}", school = "{Technische Universit\"{a}t Wien}", year = "1993" } @MastersThesis{beer93, author = "Felix Beer", title = "{Globale Optimierung}", school = "{Technische Universit\"{a}t Wien}", year = "1993" } @TechReport{larus&ball92, author = "James R. 
Larus and Thomas Ball", title = "Rewriting Executable Files to Measure Program Behavior", institution = "University of Wisconsin Computer Sciences", year = "1992", number = "1083", ftp = "primost.cs.wisc.edu", ftpfile = "pub/rewriting-tr.ps.Z", annote = "Describes the advantages of inserting instrumentation code after linking (not too convincing) and discusses properties of the executable file format, that cause or solve problems with this approach. Their method does not insert instrumentation code at every edge of the control flow graph and computes the missing information from the rest. The most heavily used edges can run at full speed." } @Article{diefendorff&allen92, author = "Keith Diefendorff and Michael Allen", title = "Organization of the {Motorola} 88110 Superscalar {RISC} Microprocessor", journal = ieeemicro, year = "1992", pages = "40--63", month = apr, annote = "The 88110 can issue two instruction per cycle using ten functional units (two integer); It has special graphics commands, an extra $32 \times 80$ register file and fast (3-cycle) floating point. The instructions are executed in-order, except for branches and stores. Speculative execution is supported through a history buffer." } @InProceedings{schuetz92, author = "Udo Sch{\"u}tz", title = "{Optimierung von Fadencode}", booktitle = "{FORTH-Tagung}", year = "1992", organization = "Forth Gesellschaft e.V.", address = "Rostock", annote = "Describes peephole optimization of Forth's threaded code. While I doubt that the impact on the run time justifies the effort, it may convince hackers to abstain from optimizing the code into a mess." } @Article{nicolau89, author = "Alexandru Nicolau", title = "Run-Time Disambiguation: Coping with Statically Unpredictable Dependencies", journal = ieeetc, year = "1989", volume = "38", number = "5", pages = "663--678", month = may, annote = "Static alias analysis often cannot determine whether two memory accesses refer to the same location. Conventionally the worst case is assumed, and the resulting dependency prohibits good schedules. Run-time Disambiguation assumes the best case, and checks this assumption at run-time. The application of this idea in the {Bulldog} trace scheduling compiler is discussed. The speedup achieved over {Bulldog} without RTD is up to~7. The code expansion is about proportional to the speedup, but can be reduced without too much effect on speed by not applying RTD to rarely-executed parts and by combining the routines handling the exceptional cases." } @InProceedings{blanck&krueger92, author = "Greg Blanck and Steve Krueger", title = "The {SuperSPARC} Microprocessor", booktitle = "COMPCON: Digest of Papers", year = "1992", pages = "136--141", OPTorganization = "IEEE", annote = "The SuperSPARC (aka Viking) is a superscalar SPARC implementation that can issue up to three instructions per cycle. Which instruction can issue together is determined by a set of 23 rules. Some specialties: results from a load are available in the next cycle; dependent integer instructions can be issued in the same cycle, even a load can depend on one integer instruction in the same cycle; ``hard'' instructions like SAVE issue alone." } @InProceedings{nicolau85, author = "Alexandru Nicolau", title = "Uniform Parallelism Exploitation in Ordinary Programs", booktitle = "1985 International Conference on Parallel Processing", year = "1985", pages = "614--618", annote = "Describes the basic transformation rules and the conceptual framework of Percolation Scheduling. However, no guiding rules etc. 
are detailed." } @Article{smith&pleszkun88, author = "James E. Smith and Andrew R. Pleszkun", title = "Implementing Precise Interrupts in Pipelined Processors", journal = ieeetc, year = "1988", volume = "37", number = "5", pages = "562--573", month = may, annote = "After defining precise interrupts, this paper describes several ways to achieve them in pipelined machines: plain in-order completion is slow, because new instructions must wait longer for the results. To resolve the problem, the paper presents in-order completion with bypasses, history buffers and future files (a shadow register file that keeps the imprecise state). Stores should be issued immediately and buffered in the memory unit to avoid performance problems. A performance analysis is done (on a high-latency model) and extensions to virtual memory, cache memory, and vectors are discussed." } @Book{kane&heinrich92, author = "G. Kane and J. Heinrich", title = "{MIPS RISC} Architecture", publisher = "Prentice-Hall", year = "1992" } @Article{padua&wolfe86, author = "David A. Padua and Michael J. Wolfe", title = "Advanced Compiler Optimizations for Supercomputers", journal = cacm, year = "1986", volume = "29", number = "12", pages = "1184--1201", month = dec } @InProceedings{chaitin82, author = "G. J. Chaitin", title = "Register Allocation \& Spilling via Graph Coloring", crossref = "sigplan82", pages = "98--105" } @Book{zech84, author = "Ronald Zech", title = "{Die Programmiersprache FORTH}", publisher = "Franzis", year = "1984", address = "M{\"u}nchen", edition = "First", note = "In German" } @InProceedings{briggs+92, author = "Preston Briggs and Keith D. Cooper and Linda Torczon", title = "Rematerialization", crossref = "sigplan92", pages = "311--321", annote = "Some values are cheaper to recompute than to spill. This paper describes a framework for exploiting this fact. It consists of using static single assignment form for analysis, splitting live ranges, and modifications to register coalescing. The results are positive, but it is not clear how significant they are." } @InProceedings{proebsting&fischer92, author = "Todd A. Proebsting and Charles N. Fischer", title = "Probabilistic Register Allocation", crossref = "sigplan92", pages = "300--310", annote = "The probability that a value survives in a register during an execution path is used for deciding which value to hold in a register and which to spill. Empirical data is presented, but unfortunately no comparison with other approaches. The algorithm seems to be very slow." } @InProceedings{rau+92, author = "B. R. Rau and M. Lee and P. P. Tirumalai and M. S. Schlansker", title = "Register Allocation for Software Pipelined Loops", crossref = "sigplan92", pages = "283--299", annote = "The problem of register allocation in modulo scheduled loops is to allocate the registers in a way that minimizes register idle time. The constraints on allocations depend on the code generation strategy (hardware support, preconditioning vs.\ multiple loop exits etc.). Registers are heuristically allocated to lifetimes in a heuristically determined order. The best heuristics work very well and produce allocations at or near the lower bound. Therefore the authors recommend choosing a code generation strategy that minimizes the produced code." } @InProceedings{mueller&whalley92, author = "Frank Mueller and David B.
Whalley", title = "Avoiding Unconditional Jumps by Code Replication", crossref = "sigplan92", pages = "322--330", annote = "Nearly all unconditional jumps can be eliminated by code replication. This expands compiled C code by about 50\%, but reduces the number of executed instructions and the cache work (miss ratio increases slightly). There are some nontrivial problems in this technique, which are solved heuristically in this paper." } @InProceedings{granlund&kenner92, author = "Torbj{\"o}rn Granlund and Richard Kenner", title = "Eliminating Branches using a Superoptimizer and the GNU C compiler", crossref = "sigplan92", pages = "341--352", annote = "A superoptimizer is used to generate optimal code fragments for use by the GNU C compiler. The fragments are for conditional expressions for the RS/6000. Their superoptimizer uses an interpreter instead of the target machine and is therefore machine-independent." } @InProceedings{brooks+92, author = "Gary Brooks and Gilbert J. Hansen and Steve Simmons", title = "A New Approach to Debugging Optimized Code", crossref = "sigplan92", pages = "1--11", annote = "Stepping through the code is performed by highlighting the active source code, in optimized execution order. Optimization is not made transparent. The paper describes a compiler-debugger interface for this purpose, but the description seems to be a little too specific (there's no explanation what is necessary for the approach and what's specific to the system)." } @InProceedings{ramsey&hanson92, author = "Norman Ramsey and David R. Hanson", title = "A Retargetable Debugger", crossref = "sigplan92", pages = "22--31", annote = "Postscript is used for communication between compiler and debugger." } @InProceedings{tjiang&hennessy92, author = "Steven W. K. Tjiang and John L. Hennessy", title = "Sharlit---A tool for building optimizers", crossref = "sigplan92", pages = "82--93", annote = "Sharlit is a tool for writing data flow analysers. It can work on simple one-instruction-one-node flow graphs. Flow graph simplification rules are used to reduce the run-time and space cost of the analyser." } @InProceedings{tan&lin92, author = "Jichang Tan and I-Peng Lin", title = "Compiling Data Flow Analysis of Logic Programs", crossref = "sigplan92", pages = "106--115", annote = "Data flow analysis is performed by compiling the Prolog program to WAM code and interpreting the WAM code in a different way, i.e.\ abstract interpretation at the WAM code level. If compilation time to WAM code is not counted, they get speedups of 14--575 times over the Aquarius Prolog compiler on small programs, but the compilation to the WAM takes as much time as the analysis on the Aquarius Prolog compiler." } @InProceedings{jaffar+92, author = "Joxan Jaffar and Spiro Michaylov and Peter J. Stuckey and Roland H. C. Yap", title = "An Abstract Machine for CLP($\cal R$)", crossref = "sigplan92", pages = "128--139" } @InProceedings{hendren+92sigplan, author = "Laurie J. Hendren and Joseph Hummel and Alexandru Nicolau", title = "Abstractions for Recursive Pointer Data Structures: Improving the Analysis and Transformation of Imperative Programs", crossref = "sigplan92", pages = "249--260", annote = "Analysis of data structure using pointers can be improved, if the data structure definitions contain information about how the pointers in them can be used. This paper presents such a notation (ADDS) and describes its formal properties and applications." 
} @InProceedings{diwan+92, author = "Amer Diwan and Eliot Moss and Richard Hudson", title = "Compiler Support for Garbage Collection in a Statically Typed Language", crossref = "sigplan92", pages = "273--282", annote = "The compiler generates a descriptor for every garbage collection point that enables the run-time system to modify the appropriate values for copying garbage collection. This includes derived values like differences between pointers that may arise due to optimizations." } @InProceedings{hoelzle+92, author = "Urs H{\"o}lzle and Craig Chambers and David Ungar", title = "Debugging Optimized Code with Dynamic Deoptimization", crossref = "sigplan92", pages = "32--43", annote = "The debugger for SELF hides the effects of optimization. This is achieved by not performing some optimizations (e.g. tail-call optimization), and by restricting others to the areas between interrupt points (one per procedure call or loop iteration). During debugging, the unoptimized (stack) state and methods are recovered lazily (only the active ones). Adding debugging information increases space usage by 2.2--3.3 times." } @InProceedings{proebsting92, author = "Todd A. Proebsting", title = "Simple and Efficient BURS Table Generation", crossref = "sigplan92", pages = "331--340", annote = "Describes an algorithm for generating tree parsing automata from tree grammars. The description is very detailed. Table compression techniques are used to avoid huge tables. The paper describes a compression technique that is both simpler and more effective than earlier techniques. The paper also discusses engineering issues in the implementation of the algorithm and gives an empirical comparison with previous work." } @Proceedings{sigplan92, key = "SIGPLAN '92", title = "SIGPLAN '92 Conference on Programming Language Design and Implementation", booktitle = "SIGPLAN '92 Conference on Programming Language Design and Implementation", year = "1992" } @Book{ting81, author = "C. H. Ting", title = "Systems Guide to fig-Forth", publisher = "Offete Enterprises, Inc.", year = "1981", address = "San Mateo, CA 94402", OPTedition = "First" } @InProceedings{lam&wilson92, author = "Monica S. Lam and Robert P. Wilson", title = "Limits of Control Flow on Parallelism", crossref = "isca92", pages = "46--57", annote = "Or rather: how control flow limits instruction-level parallelism. Several means for circumventing these limits are discussed: unidirectional speculative execution, control dependence analysis (treatment of code interrupted by e.g.\ an if-statement), and executing multiple flows of control. Machine models based on combinations of these techniques were simulated and empirical results on non-numeric benchmarks are listed. The simulations relax nearly all scheduling constraints except true data dependences and model-related control-flow constraints. The model-related dependences are not entirely realistic, e.g. no reordering of branches and only one branch/cycle without multiple flows of control. The parallelism of the non-numeric benchmarks on the SP-CD-MF model is limited to 18--402 (harmonic mean 39.6)." } @InProceedings{fernandes&barbosa92, author = "Edil S. T. Fernandes and Fernando M. B. Barbosa", title = "Effects of Building Blocks on the Performance of Super-Scalar Architectures", crossref = "isca92", pages = "36--45", annote = "De mortuis ..." } @InProceedings{franklin&sohi92, author = "Manoj Franklin and Gurindar S.
Sohi", title = "The Expandable Split Window Paradigm For Exploiting Fine-Grain Parallelism", crossref = "isca92", pages = "58--67", annote = "This new architecture is situated between superscalar and shared-memory machine. The guiding principle of this machine is decentralization, in order to be expandable (scalable). It consists of several {\em stages}, which are nearly full processors. Each stage processes a small chunk of code ({\em basic window}) at a time, e.g. a basic block or an if-statement. The stages are organized in a queue, processing (probably) consecutive windows. The basic windows are executed in a pipelined fashion: The results needed by later basic windows are passed along, so they can execute concurrently with earlier basic windows. An architecture along this framework has been simulated running the SPEC benchmarks and the results on ordinary code are comparable to other superscalar processors. With a bit of scheduling even larger parallelism can be achieved." } @InProceedings{degloria&farabischi92, author = "De Gloria, Alessandro and Paolo Faraboschi", title = "Instruction-level Parallelism in {Prolog}: Analysis and Architectural Support", crossref = "isca92", pages = "224--233", annote = "Traces of code produced by the Aquarius Prolog compiler displays the following properties: 32\% of the executed instructions access memory. Branch prediction accuracy is 86\%. The average basic block length is 6~instructions. Using trace scheduling increases the size of the scheduled code chunks to 11--12 and increases parallelism by 30\% over basic block scheduling. The overall speedup on a multi-ALU machine over a sequential machine is up to 1.95, with a two-ALU machine achieving a speedup of 1.89. A prototype VLIW processor based on this work delivers 2.1~MLIPS (30~MHz)." } @InProceedings{lenoski+92, author = "Daniel Lenoski and James Laudon and Truman Joe and David Nakahira and Luis Stevens and Anoop Gupta and John Hennessy", title = "The {DASH} Prototype: Implementation and Performance", crossref = "isca92", pages = "92--103", annote = "DASH is a larg-scale shared-memory multiprocessor. The final goal is a 64-processor machine. The current 16-processor version shows speedup factors of 4--14 on parallel applications. DASH uses 4-processor clusters; accesses to otrher clusters are served by a directory mechanism." } @InProceedings{intrater&spillinger92, author = "Gideon Intrater and Ilan Spillinger", title = "Performance Evaluation of a Decoded Instruction Cache for Variable Instruction-Length Computers", crossref = "isca92", pages = "106--113", annote = "Decoded instruction caches for variable instruction-length machines differ somewhat in their behaviour from usual caches, because the addresses of consecutive instructions cannot be simply mapped into consecutive cache lines. The paper discusses and evaluates various mapping schemes based on suppressing low-order bits, associativity and line size. Associativity is more important for decode instruction caches. On the other hand, increasing line size is not as useful, because only instructions at the start of a line can be found in decoded instruction caches." 
@InProceedings{hirita+92, author = "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine and Yoshiyuki Mochizuki and Akio Nishimura and Yoshimori Nakase and Teiji Nishizawa", title = "An Elementary Processor Architecture with Simultaneous Instruction Issuing from Multiple Threads", crossref = "isca92", pages = "136--145", annote = "A mixture of superscalar and shared-memory machine: the machine has several functional units, several register files and sequencing units. The functional units are allocated to threads on a per-cycle basis. Conflicts are resolved using a priority scheme, with the priorities of the threads changing very often. Inter-thread communication is achieved through queue registers. Iterations of a loop can be executed in parallel on different threads. This architecture achieves a speedup of up to 5.79 using 8 threads and two load/store units on a ray-tracing program. Trading threads for superscalar execution reduces the performance of this application." } @InProceedings{yeh&patt92, author = "Tse-Yu Yeh and Yale N. Patt", title = "Alternative Implementations of Two-Level Adaptive Branch Prediction", crossref = "isca92", pages = "124--134", annote = "Two-level adaptive branch prediction works by keeping a history of the last $k$ taken/not-taken decisions and using conventional (e.g. two-bit) techniques for predicting the behaviour following the history pattern. Both the history and the pattern can be maintained on a global or per-branch basis. All variations result in similar performance when given enough hardware, but per-branch history (12 bits) and global patterns are least expensive. Two-level adaptive branch prediction achieves 97\% prediction accuracy for the SPEC '89 benchmarks." } @InProceedings{najjar+92, author = "Walid A. Najjar and W. Marcus Miller and A. P. Wim B{\"o}hm", title = "An Analysis of Loop Latency in Dataflow Execution", crossref = "isca92", pages = "352--360" } @InProceedings{kurian+92, author = "Lizyamma Kurian and Paul T. Hulina and Lee D. Coraor", title = "Memory Latency Effects in Decoupled Architectures with a Single Data Memory Module", crossref = "isca92", pages = "236--245", annote = "Machines with an independent memory access processor are faster than uniprocessors with cache on numeric code, if the memory latency is low. If the latency is high, the result depends on the locality of the accesses." } @InProceedings{austin&sohi92, author = "Todd M. Austin and Gurindar S. Sohi", title = "Dynamic Dependency Analysis of Ordinary Programs", crossref = "isca92", pages = "342--351", annote = "A \cite{wall91asplos}-type study. This one emphasizes methods for building the dependence graph and what can be measured with them. It also presents parallelism profiles, i.e. parallelism/time diagrams. Other interesting points are varying the renaming capabilities and the instruction window size." } @InProceedings{olukotun+92, author = "Kunle Olukotun and Trevor Mudge and Richard Brown", title = "Performance Optimization of Pipelined Primary Caches", crossref = "isca92", pages = "181--190", annote = "An example of how to optimize cache parameters (size, pipeline depth etc.) for high performance." }
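To make the two-level scheme of \cite{yeh&patt92} above concrete, here is a minimal sketch (per-branch history, global pattern table of two-bit counters; the sizes are arbitrary assumptions, not the paper's configurations):

    # Minimal two-level adaptive branch predictor: level 1 keeps a
    # per-branch history of the last K outcomes; level 2 is a pattern
    # table of 2-bit saturating counters indexed by that history.

    K = 4                                # history length (assumed)
    history = {}                         # branch address -> K-bit history
    pattern = [2] * (1 << K)             # counters start weakly taken

    def predict(pc):
        return pattern[history.get(pc, 0)] >= 2

    def update(pc, taken):
        h = history.get(pc, 0)
        pattern[h] = min(pattern[h] + 1, 3) if taken else max(pattern[h] - 1, 0)
        history[pc] = ((h << 1) | int(taken)) & ((1 << K) - 1)

    # A branch with the pattern taken,taken,taken,not-taken soon becomes
    # almost perfectly predictable: each history gets its own counter.
    hits = 0
    for outcome in [1, 1, 1, 0] * 16:
        hits += predict(0x40) == bool(outcome)
        update(0x40, outcome)
    print(hits, 'of 64 predicted correctly')
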
@Proceedings{isca92, key = "ISCA-19", title = "The $19^{th}$ Annual International Symposium on Computer Architecture (ISCA)", booktitle = "The $19^{th}$ Annual International Symposium on Computer Architecture (ISCA)", year = "1992" } @Article{tevet89, author = "Adin Tevet", title = "Symbolic Stack Addressing", journal = jfar, year = "1989", volume = "5", number = "3", pages = "365--379", url = "http://soton.mpeforth.com/flag/jfar/vol5/no3/article2.pdf", annote = "A local variable mechanism that uses the data stack for storage. The variables are accessed by {\tt PICK} and {\tt POST} (its opposite), which means that the compiler must keep track of the stack depth. Includes source code for 8086 F83." } @InCollection{glass83, author = "Harvey Glass", title = "Towards a More Writable {Forth} Syntax", crossref = "ouverson86", chapter = "21", pages = "169--181", note = "Reprinted from {\em Proceedings of the 1983 Rochester Forth Applications Conference}" } @InCollection{perry86, author = {Michael A. Perry}, title = {A 68000 {Forth} Assembler}, crossref = {ouverson86}, chapter = {23}, pages = {193--201} } @Book{ouverson86, booktitle = {Dr.\ Dobb's Toolbook of Forth}, publisher = {M\&T Books}, year = {1986}, editor = {Marlin Ouverson}, address = {Redwood City, CA 94063} } @Article{almy86, author = "Thomas Almy", title = "Compiling {Forth} for Performance", journal = jfar, year = "1986", volume = "4", number = "3", pages = "379--388", annote = "A batch Forth compiler for the 8086 and the Z80. It produces machine code in executable files. It uses peephole optimization and keeps up to two values from the top of the stack in registers." } @Article{belinfante87, author = "Johan G.F. Belinfante", title = "{S/K/ID}: Combinators in {F}orth", journal = jfar, year = "1987", volume = "4", number = "4", pages = "555--580" } @Article{rose86, author = "Anthony Rose", title = "Design of a Fast 68000-Based Subroutine-Threaded {Forth} with Inline Code \& an Optimizer", journal = jfar, year = "1986", volume = "4", number = "2", pages = "285--288", note = "1986 Rochester Forth Conference", annote = "Inlines everything below a critical size and peephole optimizes the code." } @InProceedings{hayes+87, author = "John R. Hayes and Martin E. Fraeman and Robert L. Williams and Thomas Zaremba", title = "An Architecture for the Direct Execution of the {Forth} Programming Language", crossref = "asplos87", pages = "42--48", journal = sigplan, OPTmonth = oct, OPTvolume = "22", OPTnumber = "10", annote = "The 32-bit FRISC processor features direct access to the 4 top elements of both stacks, 4 user registers, and the usual Forth processor features. It has two 16-element stack caches that are maintained via exceptions." } @Proceedings{asplos87, key = "ASPLOS-II", title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-II)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-II)", year = "1987", } @InProceedings{hasegawa&shigei85, author = "Makoto Hasegawa and Yoshiharu Shigei", title = "High-Speed Top-of-Stack Scheme for {VLSI} Processor: A Management Algorithm and Its Analysis", pages = "48--54", booktitle = "International Symposium on Computer Architecture (ISCA)", year = "1985" } @Article{wall92, author = "David W.
Wall", title = "Experience with a Software-Defined Machine Architecture", journal = toplas, year = "1992", volume = "14", number = "3", pages = "299--338", month = jul, annote = "Discusses an assembler/linker system that does not present the actual instruction set to the compiler/user. The assembler and linker translate this IL into machine code and optimize it. The bulk of the paper discusses the optimizations performed in the linker: interprocedural register allocation and instruction scheduling. The instruction scheduler also performs a little speculative execution." } @Article{jaffar+92, author = "Joxan Jaffar and Spiro Michaylov and Peter J. Stuckey and Roland H. C. Yap", title = "The {CLP($\cal R$)} Language and System", journal = toplas, year = "1992", volume = "14", number = "3", pages = "339--395", month = jul } @Book{appel92, author = "Andrew W. Appel", title = "Compiling with Continuations", publisher = "Cambridge University Press", year = "1992" } @InProceedings{freudenberger&ruttenberg91, author = "Stefan M. Freudenberger and John C. Ruttenberg", title = "Phase Ordering of Register Allocation and Instruction Scheduling", crossref = "codegen91", pages = "146--170", annote = "They use a technique that reminds me of coagulation \cite{morris91} to solve the instruction scheduling/register allocation phase ordering problem. A trace scheduler selects traces in decreasing order of (expected) execution frequency and then passes the traces to the instruction scheduler which also performs trace-local register allocation. The allocation decisions made by the scheduler on earlier (i.e. more frequent) traces have to be respected later. In contrast to coagulation there is no register renaming, when scheduled parts merge. They avoid repair code by preferencing values into the same register. The results on their architecture (a Multiflow Trace/300 VLIW machine) are good." } @InProceedings{emmelmann91, author = "Helmut Emmelmann", title = "Code Selection by Regularly Controlled Term Rewriting", crossref = "codegen91", pages = "3--29", annote = "Non-deterministic term rewriting is proposed as a method for code selection. Instead of generating the code through side effects of a tree parser, the intermediate language is rewritten into the machine code. This enables the application of multiple rewrite rules, which makes possible a better factoring of the code selection description. The paper also describes an algorithm for processing such descriptions and presents a few results." } @InProceedings{giegerich91, author = "Robert Giegerich", title = "Considerate Code Selection", crossref = "codegen91", pages = "51--65", annote = "This approach promises to solve phase ordering problems involving code selection. All possible code selections are generated and later passes decide which one is best. To make this approach feasible, shared forests are used." } @InProceedings{boyland&emmelmann91, author = {John Boyland and Helmut Emmelmann}, title = {Discussion: Code Generator Specification Techniques (Summary)}, crossref = "codegen91", pages = "66-69" } @Book{codegen91, booktitle = "Code Generation --- Concepts, Tools, Techniques", year = "1991", OPTaddress = "Schlo{"s} Dagstuhl", publisher = "Springer", editor = "Robert Giegerich and Susan L. 
Graham", series = "Workshops in Computing", } @unpublished(haas92, author="Mike Haas", title="Re: Addressable Stacks?", note="Usenet news group comp.lang.forth, message {``BprEnu.7AH@starnine.com''}", month=jun, year="1992", annote="JForth V3.0 keeps up to 5 values in registers." ) @InProceedings{chambers&ungar89, author = "Craig Chambers and David Ungar", title = "Customization: Optimizing Compiler Technology for {{\sc {Self}}}, a Dynamically-Typed Object-Oriented Programming Language", booktitle = "SIGPLAN '89 Conference on Programming Language Design and Implementation", year = "1989", pages = "146--160" } @Book{ungar87, author = "David Ungar", title = "The Design and Evaluation of a High-Performance Smalltalk System", publisher = "MIT Press", year = "1987" } @InProceedings{krall&berger92, author = "Andreas Krall and Thomas Berger", title = "Fast {Prolog} with a {VAM$_{1p}$} based {Prolog} Compiler", crossref = "plilp92", pages = "245--259" } @Book{plilp92, booktitle = "Programming Language Implementation and Logic Programming (PLILP '92)", publisher = "Springer LNCS~631", year = "1992" } @Article{tanenbaum+83, author = "Andrew S. Tanenbaum and Hans van Staveren and E. G. Keizer and Johan W. Stevenson", title = "A Practical Tool Kit for Making Portable Compilers", journal = cacm, year = "1983", volume = "26", number = "9", pages = "654--660", month = sep, annote = "Describes the Amsterdam Compiler Kit and all its phases." } @Article{cytron+91, author = "Ron Cytron and Jeanne Ferrante and Barry K. Rosen and Mark N. Wegman and F. Kenneth Zadeck", title = "Efficiently Computing Static Single Assignment form and the Control Dependence Graph", journal = toplas, year = "1991", volume = "13", number = "4", pages = "451--490", month = oct } @InProceedings{koopman92, author = "Koopman, Jr., Philip J.", title = "A Preliminary Exploration of Optimized Stack Code Generation", booktitle = "1992 Rochester Forth Conference", year = "1992", abstract = "This paper presents an experimental code generator that performs intra-block stack scheduling for a stack-based execution model. For small test programs, 91\% to 100\% of redundant local variable accesses were eliminated using this compiler. Compiled intra-block stack scheduling and hand-performed global stack scheduling show that significant opportunities exist to keep temporary variable values on the expression evaluation stack when compiling conventional languages." } @Article{chang+92, author = "Pohua P. Chang and Scott A. Mahlke and William Y. Chen and {Wen-mei} W. Hwu", title = "Profile-guided Automatic Inline Expansion for {C} Programs", journal = spe, year = "1992", volume = "22", number = "5", pages = "349--369", month = may } @PhdThesis{briggs92, author = "Preston Briggs", title = "Register Allocation via Graph Coloring", school = "Rice University", year = "1992", address = "Houston" } @InProceedings{hendren+92cc, author = "Laurie J. Hendren and Guang R. Gao and Erik R. Altman and Chandrika Mukerji", title = "A Register Allocation Framework Based on Hierarchical Cyclic Interval Graphs", crossref = "cc92", pages = "176--191", annote = "A new register allocation algorithm. It works well for inner loops and can be generalized for more complex control structures. It's performance in these cases is not yet clear." } @InProceedings{griesemer92, author = "Robert Griesemer", title = "Scheduling Instructions by Direct Placement", crossref = "cc92", pages = "229--235", annote = "This scheduling algorithm tries to minimize compile time. 
It does not build a data dependence graph and it does not compute information on this graph in extra passes. Instead, it takes the instructions as generated by code selection, finds the first idle cycle after each instruction's earliest execution time (EET), and places the instruction there. This produces the same schedule as list scheduling with EET as the primary heuristic and original order as the tie-breaker." } @InProceedings{emmelmann92, author = "Helmut Emmelmann", title = "Testing Completeness of Code Selector Specifications", crossref = "cc92", pages = "163--175" } @Proceedings{cc92, booktitle = "Compiler Construction (CC'92)", title = "Compiler Construction (CC'92)", key = "CC'92", year = "1992", publisher = "Springer LNCS~641", address = "Paderborn", } @InProceedings{rogers&li92, author = "Anne Rogers and Kai Li", title = "Software Support for Speculative Loads", crossref = "asplos92", pages = "38--50", annote = "Speculative loads bypass the cache, are scoreboarded, and set a poison bit associated with the result register if the load causes an exception. When the loaded value is used, poison causes an exception." } @InProceedings{mahlke+92, author = "Scott A. Mahlke and William Y. Chen and {Wen-mei} W. Hwu and B. Ramakrishna Rau and Michael S. Schlansker", title = "Sentinel Scheduling for {VLIW} and Superscalar Processors", crossref = "asplos92", pages = "238--247", annote = "Attacks the problem of speculatively executing trapping instructions. An exception generated by a speculative instruction is noted in a tag associated with the result register. These tags are propagated until checked by a sentinel instruction, which then traps. Recovery is performed by restarting the code from the original exception-causing instruction. Of course this poses heavy restrictions on the register allocator. Also, nothing ensures that the exceptions are taken in the correct order." } @InProceedings{fisher&freudenberger92, author = "Joseph A. Fisher and Stefan M. Freudenberger", title = "Predicting Conditional Branch Directions From Previous Runs of a Program", crossref = "asplos92", pages = "85--95", annote = "``Instructions per mispredicted branch'' is proposed as a more meaningful measure than the correct prediction percentage. Programs from the SPEC89 suite and a few others are measured with different data sets and static branch prediction. Integer programs (and spice2g6) have one misprediction for every 35--170 instructions, numeric software has 250--7500 instructions/misprediction. Using the wrong data set for prediction often halves the prediction accuracy, and reduces it to 12\% for spice2g6. These problems can usually be avoided by using several different data sets." } @InProceedings{smith+92, author = "Michael D. Smith and Mark Horowitz and Monica S. Lam", title = "Efficient Superscalar Performance Through Boosting", crossref = "asplos92", pages = "248--259", annote = "An extension of the work of \cite{smith+90}. Recovery of state at exceptions is now done by compiler-generated code. The global scheduling scheme they use is a variant of trace scheduling, used after register allocation. They have reduced the hardware cost of their scheme: one level of backup hardware suffices for several levels of branch prediction. This requires a more intelligent compiler, and restricts scheduling freedom slightly. They give experimental data for a simulated degree-2 superscalar machine. They achieve speedups of 1.5 over a scalar machine and 1.2 over nonspeculative execution." }
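The direct-placement idea in \cite{griesemer92} above is compact enough to sketch. The following is my schematic reconstruction from the annotation (single-issue machine and unit default latency assumed), not the paper's code:

    # Schematic direct-placement scheduling: instructions arrive in
    # code-selection order; each is dropped into the first free cycle
    # at or after its earliest execution time (EET).

    def schedule(instrs, deps, latency):
        # instrs: ids in original order; deps: id -> predecessor ids
        finish, busy, slot = {}, set(), {}
        for i in instrs:
            eet = max((finish[p] for p in deps.get(i, [])), default=0)
            cycle = eet
            while cycle in busy:       # single-issue machine assumed
                cycle += 1
            busy.add(cycle)
            slot[i] = cycle
            finish[i] = cycle + latency.get(i, 1)
        return slot

    print(schedule(['a', 'b', 'c'], {'c': ['a']}, {'a': 2}))
    # a -> cycle 0, b -> cycle 1, c -> cycle 2 (waits for a's 2-cycle latency)
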
@InProceedings{andrews&sand92, author = "Kristy Andrews and Duane Sand", title = "Migrating a {CISC} Computer Family onto {RISC} via Object Code Translation", crossref = "asplos92", pages = "213--222", annote = "They use binary translation for migrating existing Tandem NonStop (TNS) code to the new MIPS-based Tandems. The TNS is a mixture of stack machine and register machine (registers can be addressed relative to a register stack pointer (RP) or absolutely); one of the challenges of the translation is predicting the value of RP at every point in the program. If the prediction is wrong, the translated program drops into interpretive mode (for a time). The paper emphasizes that the resulting translated code can be debugged at the original machine level. The resulting code on an R3000 is about 5 times larger and about 3 times faster than the original code on a CLX~800, a machine based on similar technology." } @Proceedings{asplos92, key = "ASPLOS-V", title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-V)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-V)", year = "1992", } @InProceedings{robertson92, author = {Alan M. Robertson}, title = {A 448 Byte Forth Multitasking Kernel}, crossref = euroforth92, pages = {55--59} } @InProceedings{charlton91, author = {Gordon Charlton}, title = {{FOSM}, a {FOrth String Matcher}}, booktitle = {EuroForml '91 proceedings}, year = {1991} } @InProceedings{charlton92, author = {Gordon Charlton}, title = {{FOSM}, a {FOrth String Matcher}, continued}, crossref = {euroforth92}, pages = {113--122} } @Proceedings{euroforth92, key = "EuroForth~'92", title = "EuroForth~'92", booktitle = "EuroForth~'92", year = "1992", organization = "MicroProcessor Engineering", address = "Southampton, England" } @InProceedings{jakeman96, author = {C.M. Jakeman}, title = {Portable Back-tracking In ANS Forth}, booktitle = {FORML '96 Proceedings}, year = 1996, url = {ftp://ftp.taygeta.com/pub/Forth/Applications/fosm1v1.zip} } @Misc{beusterien92, author = "Paul Beusterien", howpublished = "Personal communication", year = "1992", annote = "The Harris C compiler performs register reallocation during instruction scheduling to reduce anti- and output dependences." } @Misc{briggs92email, author = "Preston Briggs", howpublished = "Personal communication", year = "1992", annote = "Register selection does not pay off for register allocation alone." } @InProceedings{auslander&hopkins82, author = "Marc Auslander and Martin Hopkins", title = "An Overview of the {PL.8} Compiler", crossref = "sigplan82", pages = "22--31" } @Proceedings{sigplan82, key = "SIGPLAN~'82", title = "SIGPLAN~'82 Symposium on Compiler Construction", booktitle = "SIGPLAN~'82 Symposium on Compiler Construction", year = "1982" } @InProceedings{vanhentenryck&deville91, author = "Van Hentenryck, Pascal and Yves Deville", title = "The Cardinality Operator: A New Logical Connective for Constraint Logic Programming", crossref = "iclp91", pages = "745--759", annote = "The cardinality operator is a very powerful metaconstraint. Basically, it takes a number of constraints and a lower and upper bound, and it succeeds if the number of successful constraints lies between the lower and the upper bound. This subsumes disjunction, negation, and other connectives of constraints." }
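To make the semantics of \cite{vanhentenryck&deville91} concrete, a decision-only sketch of the cardinality operator over already-decided constraints (a real implementation propagates incrementally; this merely checks the bounds):

    # Decision-only view of the cardinality operator #(l, [c1..cn], u):
    # succeeds iff the number of satisfied constraints lies in [l, u].

    def cardinality(l, constraints, u):
        sat = sum(1 for c in constraints if c())
        return l <= sat <= u

    x = 3
    print(cardinality(1, [lambda: x > 0, lambda: x > 5], 1))  # True: exactly one holds
    print(cardinality(0, [lambda: x > 0], 0))                 # False: acts as negation
    print(cardinality(1, [lambda: x > 9, lambda: x < 9], 2))  # True: acts as disjunction
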
@InProceedings{aggoun&beldiceanu91, author = "Abderrahmane Aggoun and Nicolas Beldiceanu", title = "Overview of the {CHIP} Compiler System", crossref = "iclp91", pages = "775--789" } @Proceedings{iclp91, key = "ICLP-8", title = "Eighth International Conference on Logic Programming (ICLP-8)", booktitle = "Eighth International Conference on Logic Programming (ICLP-8)", year = "1991", publisher = "MIT Press" } @InProceedings{taylor90, author = "Andrew Taylor", title = "{LIPS} on a {MIPS}", crossref = "iclp90", pages = "174--185" } @Proceedings{iclp90, key = "ICLP-7", title = "Seventh International Conference on Logic Programming (ICLP-7)", booktitle = "Seventh International Conference on Logic Programming (ICLP-7)", year = "1990", publisher = "MIT Press" } @inproceedings (dincbas+88ecai, author = "Dincbas, M. and Simonis, H. and Van Hentenryck, P.", title = "{Solving the Car Sequencing Problem in Constraint Logic Programming}", booktitle = "European Conference on Artificial Intelligence (ECAI-88)", address = "M{\"u}nchen", year = "1988" ) @Article{paysan91, author = "Bernd Paysan", title = "{Ein optimierender Forth-Compiler}", journal = "Vierte Dimension", year = "1991", volume = "7", number = "3", pages = "22--25", month = sep } @InCollection{bundy+84, author = "Alan Bundy and Ben du Boulay and Jim Howe and Gordon Plotkin", title = "How to Get a {Ph.D.} in {AI}", booktitle = "Artificial Intelligence---Tools, Techniques, and Applications", chapter = "5", publisher = "Harper\&Row", year = "1984", editor = "Tim O'Shea and Marc Eisenstadt", pages = "139--154", address = "New York" } @Article{march91, author = "Salvatore T. March", title = "Editorial Policy", journal = "ACM Computing Surveys", year = "1991", volume = "23", number = "2", pages = "133--141", month = jun } @PhdThesis{smith92, author = "Michael David Smith", title = "Support for Speculative Execution in High-Performance Processors", school = "Stanford University", year = "1992", annote = "The main improvement over \cite{smith+92} is the description of the global instruction scheduling algorithm. It is based on trace scheduling, but tries to minimize compensation code. It also does a bit of scheduling across loop back-edges. He also generalizes boosting to ``opportunistic instruction scheduling'', but does not explain it well." } @InProceedings{chen+91, author = "William Y. Chen and Scott A. Mahlke and Pohua P. Chang and {Wen-mei} W. Hwu", title = "Data Access Microarchitectures for Superscalar Processors with Compiler-Assisted Data Prefetching", crossref = "micro24", pages = "69--73", annote = "A small cache (1K) with a prefetch buffer (32 entries) is better than a larger cache (2K) for programs with compiler-generated prefetch code. They also give data on loads that depend on other loads (making prefetching ineffective): 20--50\% of the loads in the analysed traces are restricted in that way." } @InProceedings{su&wang91, author = "Bogong Su and Jian Wang", title = "{GURPR*}: A New Global Software Pipelining Algorithm", crossref = "micro24", pages = "212--216", annote = "A software pipelining algorithm for loops with embedded conditionals, which produces smaller and faster code than GURPR, and sometimes much smaller (though slightly slower) code than perfect pipelining."
} @Proceedings{micro24, key = "MICRO-24", title = "24th International Symposium on Microarchitecture (MICRO-24)", booktitle = "24th International Symposium on Microarchitecture (MICRO-24)", year = "1991" } @InProceedings{sweany&beaty90, author = "Philip Sweany and Steven Beaty", title = "Post-Compaction Register Assignment in a Retargetable Compiler", crossref = "micro23", pages = "107--116", annote = "Discusses why prepass scheduling is usually preferable. For some architectures prepass scheduling means that scheduling must be repeated for every spill code insertion pass." } @Proceedings{micro23, key = "MICRO-23", title = "23rd Annual Workshop on Microprogramming and Microarchitecture (MICRO-23)", booktitle = "23rd Annual Workshop on Microprogramming and Microarchitecture (MICRO-23)", year = "1990" } @TechReport{fruehwirth92, author = "Thom Fr{\"u}hwirth", title = "Constraint Simplification Rules", institution = "ECRC", year = "1992", number = "ECRC-92-18?" } @InProceedings{leprovost&wallace92, author= {Le Provost, Thierry and Wallace, Mark}, title= {Domain Independent Propagation}, pages= {1004--1011}, crossref= {FGCS92}, annote = "Generalizes the lookahead mechanism of finite domains to arbitrary domains: finite domains are generalized to {\em basic formulae} expressible in the underlying domain. The lookahead mechanism is generalized to {\em generalized propagation}, where a basic formula that represents the solutions to the constraint as closely as possible is produced. The concept is demonstrated by applying it to ordinary Prolog. The basic formulae are ordinary Prolog bindings, which (in contrast to finite domains) can express equality of variables." } @Proceedings{FGCS92, title = {Proceedings of the International Conference on Fifth Generation Computer Systems}, booktitle = {Proceedings of the International Conference on Fifth Generation Computer Systems}, year = {1992}, publisher = {Association for Computing Machinery}, address = {ICOT, Japan}, } @TechReport{leprovost&wallace92tr, author = "Le Provost, Thierry and Mark Wallace", title = "Generalised Constraint Propagation Over the CLP Scheme", institution = "ECRC", year = "1992", number = "ECRC-92-1", note = "To appear in the Journal of Logic Programming", annote = "Discusses generalised propagation in depth. In particular, the paper gives a theoretical treatment (unfortunately only for satisfaction-complete domains) and discusses the implementation. It presents topological branch and bound as a method for implementing generalised propagation: It terminates the computation of a propagation goal as soon as it cannot generate new information." } @Article{sidebottom&havens92, author = "Gregory Sidebottom and William S. Havens", title = "Hierarchical Arc Consistency for Disjoint Real Intervals in Constraint Logic Programming", journal = "Computational Intelligence", year = "1992", volume = "8", number = "4", pages = "601--623" } @InProceedings{theobald+92, author = "Kevin B. Theobald and Guang R. Gao and Laurie J. Hendren", title = "On the Limits of Program Parallelism and its Smoothability", crossref = "micro25", pages = "10--19", annote = "Another limits study. Interesting new results: Memory renaming is very important, finite instruction windows limit parallelism to much less than window size, and instruction-level parallelism is well smoothable."
} @InProceedings{vajapeyam&hsu92, author = "Sriram Vajapeyam and Wei-Chung Hsu", title = "On the Instruction-Level Characteristics of Scalar Code in Highly-Vectorized Scientific Applications", crossref = "micro25", pages = "20--28", annote = "An empirical analysis on the CRAY Y-MP. Its significance evades me." } @InProceedings{chen+92, author = "Chien-Ming Chen and Yunn-Yen Chen and Chung-Ta King", title = "Branch Merging for Effective Exploitation of Instruction-Level Parallelism", crossref = "micro25", pages = "37--40" } @InProceedings{degloria+92, author = "Alessandro De Gloria and Paolo Faraboschi and Mauro Olivieri", title = "A Non-Deterministic Scheduler for a Software Pipelining Compiler", crossref = "micro25", pages = "41--44", annote = "Presents a mapping of the software pipelining problem to a Boltzmann Machine (similar to simulated annealing). The resulting algorithm has complexity $O(n^2)$." } @InProceedings{chang&lang&shang92, author = "Meng-Chu Chang and Feipei Lai and Rung-ji Shang", title = "Exploiting Instruction-Level Parallelism with the Conjugate Register File Scheme", crossref = "micro25", pages = "29--32", annote = "Introduces a more programmable variation of Smith's shadow structures. The more interesting part of the paper describes the register allocator: A scheduling pass determines the costs of introducing antidependences, which are then used by the register allocator as a scheduling-conflict graph. The scheduling proper is done in a postpass strategy." } @InProceedings{mahlke+92micro, author = "Scott A. Mahlke and David C. Lin and William Y. Chen and Richard E. Hank and Roger A. Bringmann", title = "Effective Compiler Support for Predicated Execution Using the Hyperblock", crossref = "micro25", pages = "45--54", annote = "Discusses lots of issues involved in if-conversion, i.e., elimination of conditional branches by using predicated execution." } @InProceedings{moon&ebcioglu92, author = "Soo-Mook Moon and Kemal Ebcio\u{g}lu", title = "An Efficient Resource-Constrained Global Scheduling Technique for Superscalar and {VLIW} processors", crossref = "micro25", pages = "55--71", annote = "Discusses global scheduling in great detail. The algorithm also includes register renaming and a limited form of combining. The paper distinguishes global scheduling and software pipelining; it uses enhanced pipeline scheduling for software pipelining. The global scheduling algorithm is based on list-scheduling, like most others. The main heuristic is degree of speculativeness, which does not need profiling information. In spite of this, the results are impressive: Average speedup of 4.8 for a VLIW machine with 16 simultaneous operations and 16-way branching (3.7 with cache). The code expands by a factor of 2.1, and the scheduling time is less than the rest of the compile time." } @InProceedings{huang+92, author = "Shih-Hsu Huang and Cheng-Tsung Hwang and Yu-Chin Hsu and Yen-Jen Oyang", title = "A New Approach to Schedule Operations Across Nested-ifs and Nested-loops", crossref = "micro25", pages = "268--271", annote = "A global scheduling algorithm that first pushes operations (not branches) down- and inwards and then pushes as many upwards as is possible without making the basic blocks longer." } @InProceedings{vegdahl92, author = "Steven R. Vegdahl", title = "A Dynamic-Programming Technique for Compacting Loops", crossref = "micro25", pages = "180--188", annote = "Extends an algorithm for optimal basic block scheduling to loop scheduling.
The algorithm builds a graph, where the nodes represent the set of already executed instructions and the edges represent groups executed in one cycle. An optimal solution is found by using a shortest-path algorithm on the graph (shortest-cycle for loops). The algorithm can handle loops with up to 20--30 instructions." } @InProceedings{sweany&beaty92, author = "Philip H. Sweany and Steven J. Beaty", title = "Dominator-Path Scheduling --- A Global Scheduling Method", crossref = "micro25", pages = "260--263", annote = "This is another scheduling algorithm similar to trace scheduling. Its selling point is that it does not need any code replication. Their prototype implementation achieved a speedup of 8.3\% on an RS6000." } @InProceedings{kiyohara&gyllenhaal92, author = "Tokuzo Kiyohara and John C. Gyllenhaal", title = "Code Scheduling for {VLIW}/Superscalar Processors with Limited Register Files", crossref = "micro25", pages = "197--201", annote = "Solves the scheduling/register allocation phase ordering problem for unrolled loops by adding a bias to the priority function that tends to discourage overlap of iterations. The paper contains much empirical data. The technique proves to be quite effective." } @InProceedings{warter+92, author = "Nancy J. Warter and Grant E. Haab and Krishna Subramanian and John W. Bockhaus", title = "Enhanced Modulo Scheduling for Loops with Conditional Branches", crossref = "micro25", pages = "170--179", annote = "Enhances modulo scheduling to loops with conditional branches. In contrast to hierarchical reduction, which preschedules the if-statements, enhanced modulo scheduling schedules all instructions at the same time and therefore avoids phase ordering problems. The new method performs 18\% better than hierarchical reduction for issue rates of 2--8. Since the target machine can only do one conditional branch/cycle, it does not do as well as predicated execution for high issue rates." } @InProceedings{allen+92, author = "V. H. Allen and J. Janardhan and R.M. Lee and M. Srinivas", title = "Enhanced Region Scheduling on a Program Dependence Graph", crossref = "micro25", pages = "72--80", annote = "Enhanced region scheduling performs four transformations, using the PDG \comment{an AST-like representation} as representation. Code motion is performed to redistribute parallelism. Software pipelining increases parallelism within loops, loop peeling increases parallelism outside loops. Peephole compaction fine-tunes the parallelism. Enhanced region scheduling achieves 62.6\% speedup over sequential execution and 29.5\% speedup over (I think) basic block scheduling on a degree 7 machine." } @InProceedings{capitanio+92, author = "Andrea Capitanio and Nikil Dutt and Alexandru Nicolau", title = "Partitioned Register Files for {VLIW}s: A Preliminary Analysis of Tradeoffs", crossref = "micro25", pages = "292--300", annote = "Tries to solve the problem of VLIWs' hunger for register ports: A simple VLIW model with partitioned register files is proposed, the code is partitioned using a search strategy (a fast version of simulated annealing), cross-partition move instructions are inserted, and the code is rescheduled. The empirical analysis considers machines with up to 8 functional units, 16 read ports and 4 partitions. While partitioning hurts when additional read ports are free, it increases performance when the cycle time depends on the number of read ports logarithmically."
} @Proceedings{micro25, key = "MICRO-25", title = "25th Annual International Symposium on Microarchitecture (MICRO-25)", booktitle = "25th Annual International Symposium on Microarchitecture (MICRO-25)", year = "1992" } @InProceedings{rather+93, author = "Elizabeth D. Rather and Donald R. Colburn and Charles H. Moore", title = "The Evolution of {Forth}", crossref = "hopl2preprints", pages = "177--199" } @InProceedings{stroustroup93, author = "Bjarne Stroustrup", title = "A History of C++: 1979--1991", crossref = "hopl2preprints", pages = "271--297", annote = "Contains a very interesting observation about the interaction of typechecking and programmer behaviour (Section 15.2.4.3): ``As programmers learned C with Classes or C++, they lost the ability to quickly find the ``silly errors'' that creep into C programs through the lack of checking. Further, they failed to take the precautions against such silly errors that good C programmers take as a matter of course. After all, ``such errors don't happen in C with Classes.'' Thus, as the frequency of run-time errors caused by uncaught argument type errors goes down, their seriousness and the time needed to find them goes up.''" } @InProceedings{rather+96, author = "Elizabeth D. Rather and Donald R. Colburn and Charles H. Moore", title = "The Evolution of {Forth}", crossref = "hopl2", pages = "625--658", url = "http://www.forth.com/Content/History/History1.htm" } @InProceedings{kay96, author = "Alan C. Kay", title = "The Early History of Smalltalk", crossref = "hopl2", pages = "511--579" } @InProceedings{steele&gabriel96, author = "Guy L. Steele and Richard P. Gabriel", title = "The Evolution of {Lisp}", crossref = "hopl2", pages = "233--309" } @Proceedings{hopl2preprints, title = "History of Programming Languages (HOPL-II) Preprints", booktitle = "History of Programming Languages (HOPL-II) Preprints", year = "1993", key = "HOPL-II", note = "SIGPLAN Notices 28(3)" } @Proceedings{hopl2, title = {History of Programming Languages}, booktitle = {History of Programming Languages}, year = 1996, key = {HOPL-II}, publisher = {ACM Press/Addison-Wesley} } @Book{levine+92, author = "John R. Levine and Tony Mason and Doug Brown", title = "lex\&yacc", publisher = "O'Reilly \& Associates", year = "1992", edition = "second", annote = "Discusses all aspects of using lex and yacc in practice." } @InProceedings{paysan93, author = "Bernd Paysan", title = "{ANS fig/GNU/??? Forth}", booktitle = "{Forth-Tagung}", year = "1993" } @TechReport{keppel+93, author = "David Keppel and Susan J. Eggers and Robert R. Henry", title = "A Case for Runtime Code Generation", institution = "Dept. of CS\&E, U. of Washington, Seattle", year = "1991", number = "91-11-04" } @TechReport{sidebottom93, author = "Greg Sidebottom", title = "Compiling Constraint Logic Programming using Interval Computations and Branching Constructs", institution = "Simon Fraser University", year = "1993", OPTnumber = "" } @InProceedings{rau&glaeser81, author = "B. R. Rau and C. D. Glaeser", title = "Some Scheduling Techniques and an Easily Schedulable Horizontal Architecture for High Performance Scientific Computing", crossref = "micro14", pages = "183--198", annote = "In contrast to other seminal papers this is not hard to read. It introduces modulo scheduling by first explaining DAG scheduling and vector loop scheduling. The second part of the paper discusses architectural support for schedulability. They propose delay elements (latches) at the ALU cross-bar meeting points.
These delay elements and the register file should be organised as FIFOs to achieve register renaming." } @Proceedings{micro14, key = "MICRO-14", title = "14th Annual Microprogramming Workshop (MICRO-14)", booktitle = "14th Annual Microprogramming Workshop (MICRO-14)", year = "1981" } @InProceedings{bourdoncle93, author = "Fran{\c c}ois Bourdoncle", title = "Abstract Debugging of Higher-Order Imperative Languages", crossref = "sigplan93", pages = "46--55", annote = "Programs are annotated with ``always'' and ``eventually'' assertions. The debugger then tries to prove these assertions by abstract interpretation. A system was implemented for a subset of Pascal. The main problem with this approach seems to be that imprecise information (e.g. from aliases) will result in lots of warnings. Some solutions for this are proposed (e.g. pass-in/pass-out instead of reference parameters)." } @InProceedings{adams+93, author = "Norman Adams and Pavel Curtis and Mike Spreitzer", title = "First-class Data-type Representations in {\sc SchemeXerox}", crossref = "sigplan93", pages = "139--146", annote = "Extends Scheme with routines that can define types as basic as lists. To make the whole thing efficient, the optimizer includes special simplifying transformations." } @InProceedings{grunwald+93, author = "Dirk Grunwald and Benjamin Zorn and Robert Henderson", title = "Improving the Cache Locality of Memory Allocation", crossref = "sigplan93", pages = "177--186", annote = "Compares several C memory allocation schemes with respect to paging and cache behaviour. The best algorithms are also the fastest CPU-wise: BSD and QuickFit." } @InProceedings{barret&zorn93, author = "David A. Barrett and Benjamin G. Zorn", title = "Using Lifetime Predictors to Improve Memory Allocation Performance", crossref = "sigplan93", pages = "187--196", annote = "The lifetime of dynamically allocated memory is predicted using a training run. It is predicted on a per-call-chain basis; the time unit is ``bytes allocated during the life time''. This information can be used to build more efficient allocators." } @InProceedings{boehm93, author = "Hans-Juergen Boehm", title = "Space Efficient Conservative Garbage Collection", crossref = "sigplan93", pages = "197--206", annote = "Discusses ways to reduce the problems of conservative garbage collectors: Blacklisting (not allocating memory from certain addresses) reduces the number of pointer misidentifications (especially those due to constant static data). Clearing dead areas of the stack avoids lengthening many lifetimes. The paper also recommends thinking about garbage collection when designing data structures, i.e., not introducing more connectivity than used by the program." } @InProceedings{flanagan+93, author = "Cormac Flanagan and Amr Sabry and Bruce F. Duba and Matthias Felleisen", title = "The Essence of Compiling with Continuations", crossref = "sigplan93", pages = "237--247", annote = "CPS compilers convert into CPS form, optimize the program and convert back. Isomorphic optimizations can be performed on the original program, saving the transformation to and from CPS form." } @InProceedings{pinter93, author = "Shlomit S. Pinter", title = "Register Allocation with Instruction Scheduling: A New Approach", crossref = "sigplan93", pages = "248--257", annote = "Performs postpass scheduling with a modified register allocator: The conflict graph of the register allocator contains all potential conflict edges, i.e., more than in a prepass approach.
The paper gives a few heuristics for removing edges if registers become a problem, but no results." } @InProceedings{huff93, author = "Richard A. Huff", title = "Lifetime-Sensitive Modulo Scheduling", crossref = "sigplan93", pages = "258--267", annote = "Modifies Modulo Scheduling to use an operation-driven scheduling strategy and gives heuristics that minimize register pressure. The algorithm produces tighter loops and less register pressure than Cydrome's scheduler. 93\% of the measured loops are within 10 registers of the ideal." } @InProceedings{kolte&harrold93, author = "Priyadarshan Kolte and Mary Jean Harrold", title = "Load/Store Range Analysis for Global Register Allocation", crossref = "sigplan93", pages = "268--277", annote = "Load and Store ranges are subranges of live ranges that have separate costs and can be used for register allocation instead of live ranges. The idea was tested by compiling several small programs for processors with tiny register sets (1--8 registers). The spill code for the 8-register machine is reduced by between $-34$\% and 52\%." } @InProceedings{kerns&eggers93, author = "Daniel R. Kerns and Susan J. Eggers", title = "Balanced Scheduling: Instruction Scheduling when Memory Latency is Uncertain", crossref = "sigplan93", pages = "278--289", annote = "Instructions that can be executed in parallel with a chain of loads are distributed evenly across the loads, in order to achieve better behaviour on cache misses etc. Balanced Scheduling offers a speedup of 6\%--8\% on a machine where all latencies (except load) are one, with the Perfect benchmarks as workload." } @InProceedings{warter+93, author = "Nancy J. Warter and Scott A. Mahlke and {Wen-mei} W. Hwu and B. Ramakrishna Rau", title = "Reverse If-Conversion", crossref = "sigplan93", pages = "290--299" } @InProceedings{ball&larus93, author = "Thomas Ball and James R. Larus", title = "Branch Prediction for Free", crossref = "sigplan93", pages = "300--313", annote = "Branch prediction without profiling: For loop branches predicting looping gives good results. This paper points out that many loop branches are not backward branches, and that control flow analysis is needed to recognize them. It also presents several heuristics for predicting non-loop branches, based on the branch opcode, properties of the successor blocks (whether a loop, call, return, use or store is in one of the successors), or whether the branch is based on a pointer comparison. These heuristics have miss rates of 16--45\% when they can be applied. They can be combined into a heuristic with a 26\% miss rate that can be applied to 79\% (5--100\%) of the non-loop branches, resulting in overall miss rates of 19\% (1--41\%). Finally they grind an axe with the instructions per mispredicted branch metric \cite{fisher&freudenberger92}, but this section is not very clear."
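, example = {A hypothetical C sketch of how such syntax-based heuristics might be encoded in a compiler; the structure, field names and the particular prediction directions are mine, the paper defines (and measures) the heuristics precisely:

    /* what the compiler knows about a conditional branch */
    struct branch_info {
        int is_loop_branch;       /* found by control-flow analysis, not by direction */
        int succ_has_call;        /* taken successor contains a call */
        int succ_is_loop_header;  /* taken successor enters a loop */
        int compares_pointers;    /* condition compares two pointers */
    };

    /* returns 1 = predict taken, 0 = predict not taken */
    int predict_taken(const struct branch_info *b)
    {
        if (b->is_loop_branch)      return 1;  /* predict another iteration */
        if (b->compares_pointers)   return 0;  /* pointers are rarely equal */
        if (b->succ_is_loop_header) return 1;  /* prefer paths entering loops */
        if (b->succ_has_call)       return 0;  /* avoid paths containing calls */
        return 1;                              /* fallback */
    }}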
} @Proceedings{sigplan93, booktitle = "SIGPLAN '93 Conference on Programming Language Design and Implementation", title = "SIGPLAN '93 Conference on Programming Language Design and Implementation", year = "1993", key = "SIGPLAN '93", note = "SIGPLAN Notices 28(6)" } @Unpublished{vanhentenryck+91, author = "Van Hentenryck, Pascal and Vijay Saraswat and Yves Deville", title = "Constraint Processing in {{\tt cc({FD})}}", url = "ftp://parcftp.xerox.com/pub/ccp/ccfd/pldi-5.ps", year = "1991", annote = "A new basis for the finite domain part of CHIP: The basic system only contains domain variables; there are four ways to construct constraints: indexical constraints (somewhat similar to Nicolog's projection language), the cardinality operator, constructive disjunction, and extended asks." } @InCollection{ertl&krall95, author = {M. Anton Ertl and Andreas Krall}, title = {High Level Constraints over Finite Domains}, booktitle = {Constraint Processing}, publisher = {Springer LNCS~923}, year = {1995}, editor = {Manfred Meyer}, pages = {51--66} } @Article{vanroy94, author = {Van Roy, Peter}, title = {1983--1993: The Wonder Years of Sequential Prolog Implementation}, journal = {Journal of Logic Programming}, year = 1994, volume = {19,20}, pages = {385--441}, url = {http://www.hpl.hp.com/techreports/Compaq-DEC/PRL-RR-36.pdf}, OPTurl = {http://www.ps.uni-sb.de/Papers/abstracts/SequentialPrologImp.html} } @Proceedings{plsa94, title = "Programming Languages and System Architectures", booktitle = "Programming Languages and System Architectures", year = "1994", key = "PLSA", editor = "J{\"u}rg Gutknecht", publisher = "Springer LNCS~782", address = "{Z\"urich}" } @Article{peytonjones+93, author = "Simon L. {Peyton Jones} and John Hughes and John Launchbury", title = "How to Give a Good Research Talk", journal = sigplan, year = "1993", volume = "28", number = "11", pages = "9--12", month = nov } @InProceedings{hoffmann93, author = "Ulrich Hoffmann", title = "Static Stack Effect Analysis", booktitle = "EuroFORTH '93 conference proceedings", year = "1993", address = "Mari\'ansk\'e L\'azn\v{e} (Marienbad)" } @Article{silberman&ebcioglu93, author = "Gabriel M. Silberman and Kemal Ebcio\u{g}lu", title = "An Architectural Framework for Supporting Heterogeneous Instruction-Set Architectures", journal = ieeecomputer, year = "1993", pages = "39--56", month = jun, annote = "Proposes mechanisms for executing old binaries on new, high performance machines. Includes an interesting section on exception handling." } @InProceedings{bringman+93, author = "Roger A. Bringmann and Scott A. Mahlke and Richard E. Hank and John C. Gyllenhaal and {Wen-mei} W. Hwu", title = "Speculative Execution Exception Recovery Using Write-Back Suppression", crossref = "micro26", pages = "214--223", annote = "A hardware mechanism for the problem of speculatively executing potentially excepting instructions. It presumes a superblock scheduling framework. After the exception has happened, the writeback of that instruction and all subsequent instructions whose speculation depth is at least as great as that of the excepting instruction are suppressed. After detecting that the exception is relevant, the hardware reexecutes the instructions whose writeback was suppressed. Exceptions from instructions from different basic blocks complicate matters a little more."
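, example = {A toy C model (mine, far simpler than the proposed hardware) of the suppression rule itself: once a speculative instruction excepts, it and every later instruction speculated at least as deeply lose their write-back until recovery decides.

    struct insn { int spec_depth; int excepts; int writeback; };

    /* one pass over the issued instructions; returns the index of the
       first excepting instruction (where re-execution would start), or -1 */
    int issue(struct insn code[], int n)
    {
        int first = -1, depth = 0;
        for (int i = 0; i < n; i++) {
            if (code[i].excepts && first < 0) {
                first = i;
                depth = code[i].spec_depth;
            }
            /* suppress write-back of the excepting insn and of all
               later insns speculated at least as deeply */
            code[i].writeback = !(first >= 0 && code[i].spec_depth >= depth);
        }
        return first;
    }}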
} @Proceedings{micro26, key = "MICRO-26", title = "26th Annual International Symposium on Microarchitecture (MICRO-26)", booktitle = "26th Annual International Symposium on Microarchitecture (MICRO-26)", year = "1993" } @Proceedings{cc94, title = "Compiler Construction (CC '94)", booktitle = "Compiler Construction (CC '94)", year = "1994", key = "CC '94", publisher = "Springer LNCS~786", address = "Edinburgh", month = "April", } @Manual{dpans6-93, title = "Draft proposed American National Standard --- Forth (X3J14 dpANS6)", year = "1993", key = "ANS~Forth" } @Article{hayes92, author = "John R. Hayes", title = "User-Defined Local Variable Syntax with {ANS Forth}", journal = sigforth, year = "1992", volume = "4", number = "2", OPTpages = "19, 20, 26", annote = "Shows how to define a nice locals syntax using the ANS Forth locals wordset." } @InProceedings{diaz&codognet93, author = "Daniel Diaz and Philippe Codognet", title = "A Minimal Extension of the {WAM} for {\tt clp(FD)}", pages = "774--790", booktitle = "International Conference on Logic Programming (ICLP)", year = 1993 } @InProceedings{jourdan&sola93, author = "Jean Jourdan and Thierry Sola", title = "The Versatility of Handling Disjunctions as Constraints", pages = "60--74", booktitle = "Programming Language Implementation and Logic Programming (PLILP)", year = 1993 } @InCollection{codognet+93, author = "Philippe Codognet and Fran{\c{c}}ois Fages and Thierry Sola", title = "A Metalevel Compiler of {CLP(FD)} and Its Combination with Intelligent Backtracking", booktitle = "Constraint Logic Programming: Selected Research", publisher = "MIT Press", year = 1993, editor = "Fr{\'e}d{\'e}ric Benhamou and Alain Colmerauer", pages = "437--456" } @Article{klint81, author = "Paul Klint", title = "Interpretation Techniques", journal = spe, year = 1981, volume = 11, pages = "963--973", annote = "General discussion of interpreters. Empirical comparison of direct threading, indirect threading and token threading on PDP-11 and CYBER-73." } @Book{krasner83, title = "Smalltalk-80: Bits of History, Words of Advice", publisher = "Addison-Wesley", year = 1983, editor = "Glen Krasner" } @Book{debaere&vancampenhout90, author = "Eddy H. Debaere and Jan M. {Van Campenhout}", title = "Interpretation and Instruction Path Coprocessing", publisher = "The MIT Press", year = 1990, annote = "Good discussion of interpretation with a big bibliography. They propose instruction path coprocessing as a means to speed up interpreters. An instruction path coprocessor is similar to a microcode sequencer that has the code to be interpreted as machine code and the machine code of the main processor as microcode." } @InProceedings{pittman87, author = "Thomas Pittman", title = "Two-Level Hybrid Interpreter/Native Code Execution for Combined Space-Time Efficiency", crossref = "sigplan87", pages = "150--152", annote = "Proposes the typical Forth way of speeding up interpreters: coding time-critical stuff in assembly language." } @Proceedings{sigplan87, title = "Symposium on Interpreters and Interpretive Techniques (SIGPLAN '87)", booktitle = "Symposium on Interpreters and Interpretive Techniques (SIGPLAN '87)", year = 1987, key = "SIGPLAN '87" } @Article{bell73, author = "James R.
Bell", title = "Threaded Code", journal = cacm, year = 1973, volume = 16, number = 6, pages = "370--372" } @InCollection{brandis95, author = "Marc Brandis", title = "Register allocation using graph coloring", crossref = "comp.compilers", year = "1995", volume = "95-2", annote = "Discusses some papers on the topic. In Particular that he has implemented the algorithm of \cite{callahan&koblenz91}." } @Misc{comp.compilers, key = "{\tt c.c}", title = "{\tt comp.compilers}", booktitle = "{\tt comp.compilers}", howpublished = "Usenet Newsgroup; archives available from http://www.iecc.com/compilers/" } @InProceedings{poeial94, author = "Jaanus P{\"o}ial", title = "Forth and Formal Language Theory", crossref = "euroforth94", pages = "47--52", annote = "Shows that the stack effect notation is at least as powerful as context-free grammars and that it is better suited for specifying the syntax of Forth." } @Proceedings{euroforth94, title = "EuroForth~'94 Conference Proceedings", booktitle = "EuroForth~'94 Conference Proceedings", year = "1994", key = "EuroForth '94", address = "Winchester, UK", } @Article{golberg91, author = "David Goldberg", title = "What Every Computer Scientist Should Know About Floating-Point Arithmetic", journal = acmcs, year = "1991", volume = "23", number = "1", pages = "5--48" } @InProceedings{burger+95, author = {Robert G. Burger and Oscar Waddell and R. Kent Dybvig}, title = {Register Allocation Using Lazy Saves, Eager Restores, and Greedy Shuffling}, crossref = "sigplan95", pages = {130--138} } @Proceedings{sigplan95, booktitle = "SIGPLAN '95 Conference on Programming Language Design and Implementation", title = "SIGPLAN '95 Conference on Programming Language Design and Implementation", year = "1995", key = "SIGPLAN '95" } @Article{blake77, author = "Russell P. Blake", title = "Exploring a Stack Architecture", journal = ieeecomputer, year = "1977", volume = "10", number = "5", pages = "30--39", month = may } @InProceedings{ditzel&mclellan82, author = "David R. Ditzel and H. R. McLellan", title = "Register Allocation for Free: The {C} machine Stack Cache", pages = "48--56", booktitle = "Symposium on Architectural Support for Programming Languages and Systems", year = "1982" } @InProceedings{gallagher+94, author = "David M. Gallagher and William Y. Chen and Scott A. Mahlke and John C. Gyllenhaal and {Wen-mei} W. Hwu", title = "Dynamic Memory Disambiguation Using the Memory Conflict Buffer", crossref = "asplos94", pages = "183--193", annote = "Hardware support for run-time disambiguation. Loads that are moved in front of stores (that might access the same memory) become {\em preloads}. At the original location of the load a {\em check} instruction is inserted. If there was a conflict, the check branches to compiler-generated recovery code. The memory conflict buffer that supports this architecture is a cache-like structure that associates the destination register of a preload with the accessed address. Stores to the same address (and sometimes other stores and loads) set the conflict bit that makes the check branch to the recovery code. This structure detects all conflicts, but also gives false alarms sometimes. The number of false alarms can be kept low by setting the parameters (number of entries, associativity, ...) to appropriate values. The speedups are quite spectacular (more than 2.5 for alvinn and cmp on an 8-issue machine)." } @InProceedings{young&smith94, author = "Cliff Young and Michael D. 
Smith", title = "Improving the Accuracy of Static Branch Prediction Using Branch Correlation", crossref = "asplos94", pages = "232--241", annote = "This profiling-based method is quite different from the hardware schemes, because it differentiates between the paths on which a branch is executed. If the branch predictions along different paths are different, the branch (and part of the paths to it) is replicated. The improvements in prediction accuracy seem to be in the same league as Krall's results." } @InProceedings{calder&grunwald94, author = "Brad Calder and Dirk Grunwald", title = "Reducing Branch Costs via Branch Alignment", crossref = "asplos94", pages = "242--251", annote = "Branch alignment is the rearrangement of the basic block such that the costs of conditional branches are minimized. The best earlier algorithm greedily reduces the cost of one control flow edge at a time, starting with the most-executed edge. The algorithm of this paper exhaustively tries all possibilities for 15 edges at a time." } @InProceedings{engler&proebsting94, author = "Dawson R. Engler and Todd A. Proebsting", title = "DCG: An Efficient, Retargetable Dynamic Code Generation System", crossref = "asplos94", pages = "263--271", annote = "Generates code for the MIPS and SPARC. The interface used is {\tt lcc}'s code generation interface, so dynamic code is generated a procedure at a time, using C calling conventions. The code generator takes about 350 instructions to generate one instruction (about 100 times faster than gcc). This paper also presents some interesting examples where dynamic code generation provides great speedups over other approaches to solve the same problem (e.g., a general algorithm, or an interpreter)." } @Proceedings{asplos94, title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", year = "1994", key = "ASPLOS-VI" } @Book{weiss&smith94, author = "Shlomo Weiss and James E. Smith", title = "Power and PowerPC", publisher = "Morgan Kaufmann", year = "1994", annote = "Provides an overview of the Power and PowerPC architectures and looks at the Power1, Power2 and MPC~601 implementations. Finally, the authors look at memory and I/O sybsystems of machines employing these processors and compares the MPC~601 with the 21064 implementation of the Alpha architecture." } @InProceedings{wu&larus94, author = "Youfeng Wu and James R. Larus", title = "Static Branch Frequency and Program Profile Analysis", crossref = "micro94", pages = "1--11", annote = "The heuristics for branch prediction of \cite{ball&larus93} are not only used for predicting the branch direction, but also for predicting the branch probability and, consequently, the profile. If several heuristics predict a branch, they are combined using a mathematical method called the Dempster-Shafer theory of evidence. The results are evaluated by comparing the top $n$\% of static and dynamic profiles. The results are much better than for the heuristics used in \cite{wall91pldi}." } @InProceedings{schlansker+94, author = "Michael Schlansker and Vinod Kathail and Sadun Anik", title = "Height Reduction of Control Recurrences for {ILP} Processors", crossref = "micro94", pages = "40--51", annote = "Height reduction is applied to recurrences on which branches (in particular loop exit branches) depend." 
} @TechReport{schlansker&kathail93, author = "Michael Schlansker and Vinod Kathail", title = "Acceleration of Algebraic Recurrences on Processors with Instruction Level Parallelism", institution = "HP Laboratories", year = "1993", type = "technical report", number = "HPL-93-55", note = "A shorter version appeared in \cite{bannerjee94}.", annote = "The associative and distributive laws are applied to reduce recurrence (cyclic data flow paths) heights in (DO) loops. The basic idea is to replace some references to loop-variant variables with the expression assigned to that variable. The resulting big expressions can then be transformed to minimize the critical path length, usually in a way requiring more resources. This paper introduces blocked back-substitution: The loop is unrolled several times, and only the loop-carried copies of the variables are back-substituted, the others are computed using the slow, but resource-saving method. The paper explains how to apply blocked back-substitution to first-order and higher-order recurrences and gives formulae for the resulting recurrence path length and the needed resources. For first-order recurrences blocked back-substitution works well, allowing the exploitation of unlimited parallelism (assuming infinite loop trip counts) while increasing the operation count just by a constant factor." } @Proceedings{bannerjee94, title = "Languages and Compilers for Parallel Computing", year = "1994", editor = "Utpal Banerjee" } @InProceedings{noonburg&shen94, author = "Derek B. Noonburg and Jonathan P. Shen", title = "Theoretical Modeling of Superscalar Processor Performance", crossref = "micro94", pages = "52--62" } @InProceedings{rau94, author = "B. Ramakrishna Rau", title = "Iterative Modulo Scheduling: An Algorithm for Software Pipelining", crossref = "micro94", pages = "63--74", annote = "Analyses the compile-time of modulo scheduling, both empirically and with respect to the computational complexity. The paper also describes how to implement modulo scheduling for fast compilation. For the loops (Perfect Club, Spec, Livermore Fortran Kernels) and the machine model (a slightly modified Cydra~5) used in the paper, modulo scheduling scheduled each instruction only 1.59 times." } @InProceedings{govindarajan+94, author = "R. Govindarajan and Erik R. Altman and Guang R. Gao", title = "Minimizing Register Requirements under Resource-Constrained Rate-Optimal Software Pipelining", crossref = "micro94", pages = "85--94", annote = "The problem is formalized as an optimization problem and a search algorithm is used to solve it. The results are compared with other algorithms. The method employed is too slow for practical use, but since it is optimal, it provides a good target for other methods." } @InProceedings{golden&mudge94, author = "Michael Golden and Trevor Mudge", title = "A Comparison of Two Pipeline Organizations", crossref = "micro94", pages = "153--161", annote = "Compares pipeline organizations where the ALU and branch resolution are in the first execute stage (i.e., load-use-delays) and organizations where these functions are in the last execute stage (i.e., address-generation-load delays and higher branch miss penalties). Interestingly, with good branch prediction the latter organization performs better even on code that is scheduled for the first organization. The explanation given is that a computed register is often used in several loads, and the delay is only incurred once, if at all."
} @InProceedings{razdan&smith94, author = "Rahul Razdan and Michael D. Smith", title = "A High-Performance Microarchitecture with Hardware-Programmable Functional Units", crossref = "micro94", pages = "172--180", annote = "A RISC processor is extended with a programmable functional unit (similar to an FPGA) that acts like other units, e.g., the ALU. The compiler automatically extracts stuff from the source program that can be accelerated with the PFU and translates it into hardware that can be accessed via PFU instructions. This approach results in speedups of 1.06--1.91 (1.06--1.16 without eqntott) for the SpecInt92 benchmarks." } @InProceedings{hoogerbrugge&corporaal94, author = "Jan Hoogerbrugge and Henk Corporaal", title = "Register File Port Requirements of Transport Triggered Architectures", crossref = "micro94", pages = "191--195", annote = "The rationale for transport triggered architectures is that the compiler will allocate resources like ports and busses better than is usually done by hardware. This paper supports this claim empirically for register ports. With only one read and one write port to the register file they achieve 1.98 operations per cycle (however, FUs and buses are present in abundance). With 3 read and write ports 3.63 operations/cycle are achieved, pretty close to the maximum of 3.8 with 6 read and write ports. Overall, transport triggered architectures need 0.5 read and 0.35 write ports per operation, compared to 2 read and 1 write port on a conventional architecture." } @Proceedings{micro94, title = "International Symposium on Microarchitecture (MICRO-27)", booktitle = "International Symposium on Microarchitecture (MICRO-27)", year = "1994", key = "MICRO-27" } @InProceedings{bailey&sotudeh93, author = "C. Bailey and R. Sotudeh", title = "Quantitative Assessment of Machine-Stack Behaviour for Better Computer Performance", booktitle = "Proceedings of the 9th International Conference on Mathematical and Computer Modelling", year = "1993", annote = "Presents some quantitative evidence on the stack usage behaviour of Forth programs, unfortunately only based on four small benchmarks." } @Article{koopman94, author = "Koopman, Jr., Philip", title = "A Preliminary Exploration of Optimized Stack Code Generation", journal = jfar, year = "1994", volume = "6", number = "3", pages = "241--251", url = "http://www.cs.cmu.edu/~koopman/stack_compiler/index.html", annote = "This paper attacks the problem of optimizing code with local variables for stack machines. It starts with gcc's intermediate code and turns every pseudoregister reference into a local variable reference. Then it tries to convert local variable accesses to stack accesses. First the distances between accesses to the same variable are determined, then the values are allocated to the stack starting with the shortest distances. This algorithm is used for basic blocks and results in removing 91\%--100\% of the redundant local variable accesses. For global stack scheduling, a few experiences from manual optimization are reported." } @Article{hayes&lee89, author = "John Hayes and Susan Lee", title = "The architecture of the {SC32} {Forth} Engine", journal = jfar, year = "1989", volume = "5", number = "4", pages = "493--506", annote = "Describes the SC32 (aka FRISC3) processor in detail. In contrast to earlier FRISC designs, it has better support for loads, stores and literals and handles stack buffer overflow with hardware, reading/writing one cell at a time."
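, example = {A toy C model (mine, much simpler than the SC32 hardware) of a circular top-of-stack buffer that spills and refills a single cell on overflow and underflow:

    typedef int cell;
    enum { NBUF = 16 };

    static cell buf[NBUF];    /* circular buffer: top NBUF stack cells */
    static int top, nbuf;     /* index of top cell, cells in buffer */
    static cell mem[1024];    /* memory part of the stack */
    static int nmem;

    void push(cell x)
    {
        if (nbuf == NBUF) {                       /* overflow */
            mem[nmem++] = buf[(top + 1) % NBUF];  /* spill bottom-most cell */
            nbuf--;
        }
        top = (top + 1) % NBUF;
        buf[top] = x;
        nbuf++;
    }

    cell pop(void)            /* caller must not pop an empty stack */
    {
        cell x = buf[top];
        top = (top + NBUF - 1) % NBUF;
        if (--nbuf == 0 && nmem > 0) {            /* underflow: refill */
            buf[top] = mem[--nmem];
            nbuf = 1;
        }
        return x;
    }}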
} @Article{chan+94, author = "Yin Chan and Ashok Sudarsanam and Andrew Wolfe", title = "The Effect of Compiler-Flag Tuning on SPEC Benchmark Performance", journal = can, year = "1994", volume = "22", number = "4", pages = "60--70", annote = "Discusses the ways in which SPEC measurements are made and their problems. In spite of investing great effort, the authors were not able to recreate all the results reported by the manufacturer. With more realistic compiler flags, they achieved 78\%--95\% of the manufacturer-reported performance. Further significant reductions in performance were measured under the realistic assumptions of using centralized file servers and producing code for older versions of the architecture." } @Article{charlesworth81, author = "Alan E. Charlesworth", title = "An Approach to Scientific Array Processing: The Architectural Design of the {AP-120B/FPS-164} Family", journal = ieeecomputer, year = "1981", pages = "18--27", month = sep, annote = "Describes the architecture of two early LIW machines designed for scientific and signal processing. It could start one FP addition, one FP multiplication, one load from or store to each of the two memories and one read from and one write to each of the two register files, one address computation and one branch per cycle. The functional units were fully pipelined with relatively short pipelines for good scalar performance. Inner loops of applications are optimized using software pipelining, which is demonstrated using the dot product, the other code uses basic-block scheduling." } @Article{pelli87, author = "Deni G. Pelli", title = "Programming in Postscript", journal = "Byte", year = "1987", volume = "12", number = "5", pages = "185--202", month = may, annote = "Describes what you can do on laser printers and typesetting machines by programming Postscript directly." } @Article{seybold86, key = "Seybold", title = "PostScript: Can It Cut the Mustard?", journal = "The Seybold Report on Publishing Systems", year = "1986", volume = "15", number = "12" } @InProceedings{dean&chambers94, author = "Jeffrey Dean and Craig Chambers", title = "Towards Better Inlining Decisions Using Inlining Trials", pages = "273--282", booktitle = "Conference on Lisp and Functional Programming", year = "1994", annote = "Inlining decisions are based on actually trying and evaluating the inlining. The results are stored in a database that is used across compilations and across programs. The database is indexed with information derived from type group analysis, in order to make the result of one trial as widely applicable as possible." } @Article{rau&fisher93, author = "B. Ramakrishna Rau and Joseph A. Fisher", title = "Instruction-Level Parallel Processing: History, Overview, and Perspective", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "9--50", note = "Reprinted in \cite{rau&fisher93book}", annote = "Gives an overview of historical developments in the area, concentrating on the VLIW architectures and the compilation techniques developed for them in the 1980s. Contains an extensive bibliography." } @Article{lowney+93, author = "P. Geoffrey Lowney and Stefan M. Freudenberger and Thomas J. Karzes and W. D. Lichtenstein and Robert P. Nix and John S. O'Donnell and John C.
Ruttenberg", title = "The {Multiflow} Trace Scheduling Compiler", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "51--142", note = "Reprinted in \cite{rau&fisher93book}", annote = "Much of what is presented here has been described already in some form in \cite{ellis85}. One advantage of the Multiflow compiler over Bulldog is that it does not duplicate code when moving it up across an IF-statement. Another original contribution of this paper is a comparison of the Multiflow Trace 14/300 with contemporary competitors, the Convex C210 and the MIPS M/1000. Not surprisingly, the Multiflow outperforms its competitors on scalar FP code, the Convex is a little better on vector code, and the MIPS is a little better on scalar integer code. The paper also contains an evaluation of the compiler, in particular of compilation speed." } @Article{beck+93, author = "Gary R. Beck and David W.L. Yen and Thomas L. Anderson", title = "The {Cydra~5} Minisupercomputer: Architecture and Implementation", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "143--180", note = "Reprinted in \cite{rau&fisher93book}" } @Article{dehnert&towle93, author = "James E. Dehnert and Ross A. Towle", title = "Compiling for the {Cydra~5}", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "180--227", note = "Reprinted in \cite{rau&fisher93book}", annote = "This compiler employs software pipelining for exploiting instruction-level parallelism." } @Article{hwu+93, author = "{Wen-mei} W. Hwu and others", title = "The Superblock: An Effective Technique for {VLIW} and Superscalar Compilation", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "229--248", note = "Reprinted in \cite{rau&fisher93book}" } @Article{schuette&shen93, author = "Michael A. Schuette and John P. Shen", title = "Instruction-Level Experimental Evaluation of the {Multiflow TRACE 14/300 VLIW} Computer", journal = "Journal of Supercomputing", year = "1993", volume = "7", number = "1/2", pages = "249--271", note = "Reprinted in \cite{rau&fisher93book}" } @Book{rau&fisher93book, title = "Instruction-level parallelism", publisher = "Kluwer Academic Publishers", year = "1993", editor = "B. Ramakrishna Rau and Joseph A. Fisher", note = "Reprint of The Journal of Supercomputing, 7(1/2)" } @Article{click&cooper95, author = {Cliff Click and Keith D. Cooper}, title = {Combining Analyses, Combining Optimizations}, journal = toplas, year = {1995}, volume = {17}, number = {2}, pages = {181--196}, annote = {A theoretical paper on combining various optimizations in one phase, in order to eliminate phase-ordering problems and get better code.} } @InProceedings{baden90a, author = "Wil Baden", title = "Virtual Rheology", booktitle = "FORML'90 Proceedings", year = "1990", annote = "Explains how to construct any control flow graph using Forth's control-flow words." } @InProceedings{baden90b, author = "Wil Baden", title = "How Many Forks for Deep Spaghetti", booktitle = "FORML'90 Proceedings", year = "1990", annote = "Shows that two branches are sufficient to produce unstructured code and enumerates the control structures that can be created using two branches." 
} @InProceedings{baden90c, author = "Wil Baden", title = "How to Uncook Spaghetti", booktitle = "FORML'90 Proceedings", year = "1990", annote = "Unstructured constructs can be converted to structured code using either code duplication or compulsion (introducing new tests). The paper sketches a method for doing it. The author concludes: ``In general the cure isn't noticably better than the disease''." } @InProceedings{baden90d, author = "Wil Baden", title = "Spaghetti Restructured", booktitle = "FORML'90 Proceedings", year = "1990", annote = "Exemplifies the elimination of unstructured code using Flynn's problem." } @InProceedings{proebsting95, author = "Todd A. Proebsting", title = "Optimizing an {ANSI~C} Interpreter with Superoperators", crossref = "popl95", pages = "322--332", annote = "Interpreter performance is optimized by combining operators during code generation, when they are still organized as trees. So a different, optimized interpreter is used for each program. Speedups of 1.8--3.1 are achieved, but this is probably strongly dependent on the original set of operators. The paper uses lcc's intermediate code operators \cite{fraser&hanson91a}." } @Proceedings{popl95, booktitle = "Principles of Programming Languages (POPL '95)", title = "Principles of Programming Languages (POPL '95)", year = "1995", key = "POPL '95" } @Misc{beuster95, author = "Bernd Beuster", howpublished = "Usenet posting in de.comp.lang.forth", year = "1995", month = may, annote = "A hand-tuned indirect threaded Forth interpreter needs 2.7s for the Sieve benchmark on a 486DX2/66." } @Book{smith92until, author = "Norman Smith", title = "Write Your Own Programming Language Using {C++}", publisher = "Wordware Publishing", year = "1992", note = "ISBN 1-55622-264-5" } @InProceedings{norris&pollock93, author = "Cindy Norris and Lori L. Pollock", title = "A Scheduler-Sensitive Global Register Allocator", booktitle = "Supercomputing'93", year = "1993", url = "ftp://www.eecis.udel.edu/pub/people/pollock/SSG.ps", annote = "Like \cite{pinter93}, their global register allocator builds a maximal interference graph, i.e., a graph that contains all interferences possible in various schedules. The register allocator does not introduce antidependences, and therefore provides maximum scheduling freedom. If the register allocator thinks it will run out of registers, it adds dependences to reduce the number of interferences. The paper empirically studies various heuristics for adding dependences, applied at various stages of the register allocator. The best one introduces dependences that remove the maximum number of interferences already before building the interference graph (based on a count of live values). With this heuristic, scheduler-sensitive register allocation is a little better than integrated prepass scheduling for 20 registers or more and a little worse for 15 registers or less. Unfortunately they do not compare these methods with plain prepass scheduling." } @InProceedings{jourdan+95, author = "St\'ephan Jourdan and Pascal Sainrat and Daniel Litaize", title = "Exploring Configurations of Functional Units in an Out-Of-Order Superscalar Processor", crossref = "isca95", pages = "117--125", annote = "Using trace-based simulation, the authors measured varying configurations of superscalar processors, starting with the degree, then continuing with the instruction window size, integer units and data cache ports, variants of specialization for the integer units (with and without shifter) and floating-point units.
" } @InProceedings{ando+95, author = "Hideki Ando and Chikako Nakanishi and Tetsuya Hara and Masao Nakaya", title = "Unconstrained Speculative Execution with Predicated State Buffering", crossref = "isca95", pages = "126--137", annote = "Instructions are not simply marked as speculative, but already with the predicate they depend upon; if such an instruction is executed before the predicate is available, the result is marked with the predicate and stored in a shadow register instead of the main register, similar to boosting. One shadow register file suffices in the context of their compiler, which allocates register to avoid conflicts. As soon as the predicate becomes available, the shadow register either becomes the main register (if the predicate is true), or is dropped (if it is false). Exception recovery is performed by hardware. The authors have develooped compiler techniques based on region scheduling to exploit this architectural feature and present empirical results." } @InProceedings{mahlke+95, author = "Scott A. Mahlke and Richard E. Hank and James E. McCormick and David I. August and {Wen-mei} W. Hwu", title = "A Comparison of Full and Partial Predicated Execution Support for {ILP} Processors", crossref = "isca95", pages = "138--149", annote = "Compares an architecture where every instruction is predicated with another version of the architecture that has only a conditional move. On a degree-8 machine with only one branch/cycle conditional moves provide 30\% speedup, and full predication provides 30\% over conditional moves." } @InProceedings{simone+95, author = "M. Simone et al.", title = "Implementation Trade-Offs in Using a Restricted Data Flow Architecture in a High Performance RISC Microprocessor", crossref = "isca95", pages = "151--162", annote = "Discusses an aggressive out-of-order degree-4 superscalar implementation of the SPARC architecture. Interestingly they implement only 4 register windows. Their instruction window can contain 64 instructions; 38 integer and 48 32-bit FP registers are available for register renaming. The reservation stations for the integer, address generation and FP units each contain 8 entries, the load/store unit 12. They also discuss the algorithm for selecting instructions from the reservation stations." } @InProceedings{diep+95, author = "Trung A. Diep and Cristopher Nelson and John Paul Shen", title = "Performance Evaluation of the PowerPC 620 Microarchitecture", crossref = "isca95", pages = "163--174", annote = "Presents empirical data about the utilization and effectiveness of various parts of the PPC~620 microarchitecture." } @InProceedings{bodin&Seznec95, author = "Fran\c{c}ois Bodin and Andr\'e Seznec", title = "Skewed Associativity Enhances Performance Predictability", crossref = "isca95", pages = "265--271", annote = "Skewed associative caches map a memory block into one line in one bank of the cache and into a different line in a different bank. If the mapping functions are chosen right, memory locations conflicting in one bank probably do not conflict in a different bank, reducing conflict misses." } @InProceedings{young+95, author = "Cliff Young and Nicolas Gloy and Michael D. Smith", title = "A Comparative Analysis of Schemes for Correlated Branch Prediction", crossref = "isca95", pages = "276--286", annote = "This excellent paper first introduces a model for branch prediction: A prediction scheme divides a program execution into substreams and feeds each substream to a predictor. 
This framework is then used to analyse and compare existing branch prediction mechanisms, notably hardware- and profiling-based \cite{young&smith94} branch-correlation schemes. The differences are isolated and analysed empirically: Path history provides slightly better performance than pattern history, aliasing (table conflicts) significantly decreases performance, correlation across calls and returns plays a significant role in several benchmarks. The difference between a static predictor and a dynamic 2-bit adapting predictor is noticeable in only a few branches, but the misses caused by these branches are significant, sometimes favouring adaptivity, sometimes favouring static prediction." } @InProceedings{calder&grunwald95, author = "Brad Calder and Dirk Grunwald", title = "Next Cache Line and Set Prediction", crossref = "isca95", pages = "287--296", annote = "Instead of storing the full target address in a branch target buffer, it is cheaper to store just the cache location of the branch target, enabling target buffers with more entries (e.g., 1024 instead of 128 at the same cost) and better performance." } @InProceedings{conte+95, author = "Thomas M. Conte and Kishore N. Menezes and Patrick M. Mills and Burzin A. Patel", title = "Optimization of Instruction Fetch Mechanisms for High Issue Rates", crossref = "isca95", pages = "333--344" } @InProceedings{uhlig+95, author = "Richard Uhlig and David Nagle and Trevor Mudge and Stuart Sechrest and Joel Emer", title = "Instruction Fetching: Coping with Code Bloat", crossref = "isca95", pages = "345--356", annote = "Claims that the SPEC benchmarks are not representative for current applications with respect to instruction cache miss rate, proposes a new set of programs for instruction cache benchmarking and compares them empirically in this respect to the SPEC programs." } @InProceedings{lee+95, author = "Dennis Lee and Jean-Loup Baer and Brad Calder and Dirk Grunwald", title = "Instruction Cache Fetch Policies for Speculative Execution", crossref = "isca95", pages = "357--367", annote = "What should be done on an instruction cache miss for speculative instructions? If we fetch from main memory, what should be done if the speculation turns out to be wrong during the access? If the latency is large, speculative I-cache misses should not be served. If the latency is small, the speculative fetch should be performed; if the speculation is wrong, execution of the right path should be resumed without waiting for the wrong miss to complete." } @InProceedings{austin+95, author = "Todd M. Austin and Dionisios Pnevmatikatos and Gurindar S. Sohi", title = "Streamlining Data Cache Access with Fast Address Calculation", crossref = "isca95", pages = "369--380", annote = "Tries to shorten the average load latency by predicting the effective address by or-ing instead of adding the components together. Software support can increase the number of successful predictions. The number of correct predictions is surprisingly high, surpassing 50\% for most benchmarks even without software support and 95\% for most integer benchmarks with software support." } @InProceedings{wang+95, author = "Hong Wang and Tong Sun and Qing Yang", title = "{CAT} --- Caching Address Tags. A Technique for Reducing Area Cost of On-chip Caches", crossref = "isca95", pages = "381--390", annote = "Many cache entries have the same few tags. This redundancy can be exploited by caching the tags themselves in a small cache.
Empirical data suggests that 32 entries are sufficient to hold the tags of a 16K or larger direct-mapped cache. The CAT reduces the size of the tag area by a factor of 2--6. If a CAT entry has to be replaced, all cache lines referring to that tag have to be invalidated; the paper does not discuss how to do this in a write-back cache." }
@InProceedings{tullsen+95, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. Levy", title = "Simultaneous Multithreading: Maximizing On-Chip Parallelism", crossref = "isca95", pages = "392--403" }
@InProceedings{sohi+95, author = "Gurindar S. Sohi and Scott E. Breach and T. N. Vijaykumar", title = "Multiscalar Processors", crossref = "isca95", pages = "414--425", annote = "Continues the work of \cite{franklin&sohi92}." }
@Proceedings{isca95, title = "$22^{nd}$ Annual International Symposium on Computer Architecture", booktitle = "$22^{nd}$ Annual International Symposium on Computer Architecture", year = "1995", key = "ISCA 22", }
@Article{kogge82, author = "Peter M. Kogge", title = "An Architectural Trail to Threaded-Code Systems", journal = ieeecomputer, year = "1982", pages = "22--32", month = mar, annote = "Explains the design of (a classical implementation of) Forth, starting with threaded code, then adding the parameter stack, constants, variables, control structures, dictionary, outer interpreter and compiler." }
@Article{epstein&gilliatt85, author = "Arnold Epstein and Claire H. Gilliat", title = "The {MAGIC/L} Programming Language", journal = jfar, year = "1985", volume = "3", number = "2", pages = "9--21", note = "1985 Rochester Forth Conference" }
@InProceedings{briggs&cooper94, author = {Preston Briggs and Keith D. Cooper}, title = {Effective Partial Redundancy Elimination}, crossref = {sigplan94}, pages = {159--170} }
@InProceedings{knoop+94, author = {Jens Knoop and Oliver R{\"u}thing and Bernhard Steffen}, title = {Partial Dead Code Elimination}, crossref = {sigplan94}, pages = {147--158} }
@Proceedings{sigplan94, booktitle = "SIGPLAN '94 Conference on Programming Language Design and Implementation", title = "SIGPLAN '94 Conference on Programming Language Design and Implementation", year = "1994", key = "SIGPLAN '94" }
@Book{fraser&hanson95, author = {Christopher Fraser and David Hanson}, title = {A Retargetable {C} Compiler: Design and Implementation}, publisher = {Benjamin/Cummings Publishing}, year = {1995}, ISBN = {0-8053-1670-1} }
@InProceedings{stanley&wedig87, author = "Timothy J. Stanley and Robert G. Wedig", title = "A Performance Analysis of Automatically Managed Top of Stack Buffers", crossref = "isca87", pages = "272--281", annote = "They propose the top-of-stack buffer as a special-purpose cache for accesses to memory near the stack pointer in a conventional architecture. They look at three different algorithms for managing the buffer, some of which utilize otherwise unused memory cycles to manage the buffer proactively, avoiding delays later. Data based on the Dhrystone benchmark are presented." }
@Proceedings{isca87, key = "ISCA-14", booktitle = "The $14^{th}$ Annual International Symposium on Computer Architecture (ISCA)", title = "The $14^{th}$ Annual International Symposium on Computer Architecture (ISCA)", year = "1987", address = "Pittsburgh, Pennsylvania", organization = "IEEE Computer Society TCCA and ACM SIGARCH", note = "{\em Computer Architecture News,} 15(2), June 1987", month = jun # " 2--5,", }
@Article{kanner+65, author = "H. Kanner and P. Kosinski and C. L.
Robinson", title = "The structure of yet another {ALGOL} compiler", journal = cacm, volume = "8", number = "7", pages = "427--438", month = jul, year = "1965", coden = "CACMA2", ISSN = "0001-0782", bibdate = "Sun Sep 18 23:35:40 1994", bibsource = "ftp://ftp.ira.uka.de/pub/bibliography/Compiler/bevan.bib and ftp://ftp.ira.uka.de/pub/bibliography/Compiler/Compiler.Lins.bib", abstract = "A high-speed ``top down'' method of syntax analysis which completely eliminates ``back-up'' of the source string has been implemented in a convenient macro-language. A technique of simulation at compile time of the use of a conventional run-time stack enables the generation of code for expressions which minimizes stores, fetches and stack-pointer motion at run time, while properly treating recursion and side effects of procedures. Block structure and recursion are handled without need for interpretive methods at run time. The ``context problem'' in the transmission to recursive procedures of parameters ``called by name'' is solved in a manner which permits the handling of common cases of simple expressions and array identifiers with particular efficiency.", checked = "19940407", sjb = "Contains two good pieces of advice: (1) Do not bother to mechanism those operations which are easily performed by humans. (2) Do not perform at run time any bookkeeping operations that can reasonably be performed at compile time. The former led to the decision to writing the lexer/parser as set of recursive routines and the latter to the removal of any form of ``go to'' interpreter \cite{Irons:Feurzig:cacm:1961}. Notes that the ALGOL report uses syntax to distinguish between arithmetic and boolean expressions but that this causes problems for their syntax analyser. The solution to the problems was to unify the syntax and make differentiating between the two types of expression a typing problem. Rest of the paper details solutions to the following areas: labels and multiple assignments; run time lists for {\bf own} variables; dealing with block structure using the symbol table; code generation for expressions; dealing with switches and procedures.", annote = "Discusses several technical issues in Algol~60 implementation; some of which are specific to the language, but others are still interesting today (e.g., how to deal with common prefixes in syntax analysis). They apparently generate code for an accumulator machine with several index registers without stack addressing modes. The code generation logically works with a stack in memory; however, the compiler emulates most stack pointer updates at compile-time and translates stack accesses into indexed accesses." } @Article{dewar75, author = {Robert B.K. 
Dewar}, title = {Indirect Threaded Code}, journal = cacm, year = {1975}, volume = {18}, number = {6}, month = jun, pages = {330--331}, annote = {Demonstrates a version of indirect threaded code with multiple code fields; it contrasts this with a version of direct threading that has a separate routine corresponding to each code field of the indirect threaded code (i.e., no immediate parameters).} }
@Book{goldberg&robson83, author = {Adele Goldberg and David Robson}, title = {Smalltalk-80: The Language and its Implementation}, publisher = {Addison-Wesley}, year = {1983} }
@Manual{ansforth94, title = "American National Standard for Information Systems: Programming Languages: Forth", key = "ANS~Forth", organization = "American National Standards Institute", year = 1994, note = "Document X3.215-1994", url = {http://www.complang.tuwien.ac.at/forth/dpans-html/dpans.htm} }
@Unpublished{hayes89, author = "John Hayes", title = "Design Tradeoffs in a Top of Stack Cache", note = "Unpublished", year = "1989" }
@Article{baden95, author = "Wil Baden", title = "Pinhole Optimization", journal = "Forth Dimensions", year = 1995, volume = 17, number = 2, pages = "29--35" }
@InProceedings{vitek&horspool96, author = {Jan Vitek and R. Nigel Horspool}, title = {Compact Dispatch Tables for Dynamically Typed Object Oriented Languages}, crossref = {cc96}, pages = {309--325} }
@InProceedings{proebsting&whaley96, author = {Todd A. Proebsting and Benjamin R. Whaley}, title = {One-Pass, Optimal Tree Parsing --- With or Without Trees}, crossref = {cc96}, pages = {294--308} }
@Proceedings{cc96, title = "Compiler Construction (CC'96)", booktitle = "Compiler Construction (CC'96)", year = "1996", key = "CC'96", editor = "Tibor Gyim\'{o}thy", OPTvolume = "1060", OPTseries = "LNCS", publisher = "Springer LNCS~1060", address = "Link{\"o}ping" }
@InProceedings{evers+96, author = "Marius Evers and Po-Yung Chang and Yale N. Patt", title = "Using Hybrid Branch Predictors to Improve Branch Prediction Accuracy in the Presence of Context Switches", crossref = "isca96", pages = "3--11", annote = "They propose a new mechanism for selecting between different branch predictors: For every BTB entry and predictor they introduce a two-bit counter that records how well this predictor did for this branch relative to the other predictors. A predictor using this mechanism outperforms other predictors for the SPECint92 benchmarks. They also apply their multi-hybrid predictor to traces that emulate context switching by regularly flushing the predictor. It also outperforms other predictors there: Some predictors (e.g., 2-bit counters) warm up quickly; other predictors are more accurate, but have a long warm-up phase." }
@InProceedings{gloy+96, author = "Nicolas Gloy and Cliff Young and J. Bradley Chen and Michael D. Smith", title = "An Analysis of Dynamic Branch Prediction Schemes on System Workloads", crossref = "isca96", pages = "12--21", annote = "The paper evaluates the effectiveness of several branch predictors for full-system traces (i.e., with kernel branches and other processes). These traces contain many more static branches and therefore increase aliasing. Consequently, for a given implementation cost, schemes that use shorter histories do better relative to user-only traces.
The paper also examines the practice of modeling context switching and kernel activity by regularly flushing the predictor: It concludes that this model is misleading, because it does not capture the differences in the organization and size of the schemes; it assumes that all models have the same contention." }
@InProceedings{sechrest+96, author = "Stuart Sechrest and Chih-Chieh Lee and Trevor Mudge", title = "Correlation and Aliasing in Dynamic Branch Predictors", crossref = "isca96", pages = "22--32", annote = "Examines the performance of various predictors for traces of larger programs. They conclude that PAs schemes perform relatively better than studies based on SPEC benchmarks indicate, especially if the full design space of these schemes is explored." }
@InProceedings{nayfeh+96, author = "Basem A. Nayfeh and Lance Hammond and Kunle Olukotun", title = "Evaluation of Design Alternatives for a Multiprocessor Microprocessor", crossref = "isca96", pages = "67--77", annote = "The basic assumption here is that, in the future, several processors will reside on one chip. How should they be interfaced to the memory hierarchy? Should they share the first-level cache, the second-level cache, or only memory? For a quite unrealistic model (sharing caches by having an $n\times n$ crossbar), they conclude that a shared-L1 architecture is best, even for multiprogramming workloads where no user data is shared between processors. Assuming a higher L1 latency for the shared L1 cache makes the shared-memory scheme best for multiprogramming workloads. For data-sharing applications, the shared-L1 scheme outperforms the others (except for MP3D, where the shared-L2 scheme is best)." }
@InProceedings{burger+96, author = "Doug Burger and James R. Goodman and Alain K{\"a}gi", title = "Memory Bandwidth Limitations of Future Microprocessors", crossref = "isca96", pages = "78--89", annote = "The widening gap between processor and memory speeds is currently often closed with techniques that hide latency but consume bandwidth (e.g., prefetching). This paper predicts that, with this trend, bandwidth will soon become a problem. We would have to build chips with thousands or tens of thousands of pins that have to be driven at a GHz or more. They also present empirical data on the effectiveness of caches at multiplying external bandwidth; they compare usual caches (LRU replacement, 32-byte lines) with optimal caches. They conclude that, in the short run, smarter caches (in particular, caches with shorter lines) will alleviate this problem, while in the long run, all memory will be on-chip." }
@InProceedings{seznec96, author = "Andr\'e Seznec", title = "Don't use the page number, but a pointer to it", crossref = "isca96", pages = "104--113", annote = "Page numbers are currently stored both in the TLB and in the caches. Instead of storing them several times, the author proposes to save chip area by keeping each page number only once, in a page number table, and to use an index into this table in the TLB and the caches. He shows how this can be implemented for several combinations of virtual and physical indexing and tagging." }
@InProceedings{Juan+96, author = "Toni Juan and Tomas Lang and Juan J. Navarro", title = "The Difference-Bit Cache", crossref = "isca96", pages = "114--120", annote = "Proposes a scheme for a two-way set-associative cache that is faster than other schemes. The problem with this scheme seems to be that it is only fast for virtually tagged caches." }
@InProceedings{tullsen+96, author = "Dean M.
Tullsen and Susan J. Eggers and Joel S. Emer and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm", title = "Exploiting Choice: Instruction Fetch and Issue on an Implementable Simultaneous Multithreading Processor", crossref = "isca96", pages = "191--202", annote = "An SMT processor exploits the resources available in a superscalar processor better by running several threads on the processor simultaneously. All the resources are shared between the threads, including the register set (which is larger, however); the exceptions are the program counter (and a return stack) and the retirement machinery. This paper concentrates on instruction fetching, which seems to have been identified as a bottleneck in an earlier paper. They propose and measure several heuristics for selecting the thread from which to fetch. The best one is selecting the thread that has the fewest instructions in the instruction queues. With this heuristic, an 8-thread 8-issue architecture achieves a throughput of 5.4 instructions/cycle." }
@InProceedings{hara+96, author = "Tetsuya Hara and Hideki Ando and Chikako Nakanishi and Masao Nakaya", title = "Performance Comparisons of ILP Machines with Cycle Time Evaluation", crossref = "isca96", pages = "213--224", annote = "Presents a VLIW with a mechanism called predicating that looks quite similar to boosting." }
@Proceedings{isca96, title = "$23^{rd}$ Annual International Symposium on Computer Architecture", booktitle = "$23^{rd}$ Annual International Symposium on Computer Architecture", year = "1996", key = "ISCA 23", }
@TechReport{moore&leach70, author = "Charles H. Moore and Geoffrey C. Leach", title = "FORTH -- A Language for Interactive Computing", institution = "Mohasco Industries, Inc.", year = "1970", address = "Amsterdam, NY", url = "http://www.ultratechnology.com/F70POST.ZIP http://www.ultratechnology.com/4th_1970.html", annote = "Describes Forth, as it was in 1970. There are surprising differences from, and surprising similarities with, modern Forth systems. The system they describe uses text interpretation instead of threaded code for definitions, although there is already a code field, i.e., the foundation for indirect threading. During the interpretation of a definition, only words defined earlier are visible, as in modern Forths, and in contrast to PostScript. To make text interpretation speed bearable, the dictionary is implemented as a hash table with external chaining. There is support for portably generating CODE words (called verbs in the paper). The syntax is a bit more complicated than today: special characters may only appear as the first character of a word, so words are not only separated by spaces. The system already features multitasking (round-robin, with preemption). Source is stored in screens (then called sheets) containing 50 lines by 40 characters. The programs look markedly different from today's because the primary stack manipulation words are @T (similar to PICK) and =T (similar to a word sometimes called STICK). Forth was running on the IBM~1130 and the Burroughs B-5500 (different cell sizes). The paper also observes that compactness of programs ``arises through the economies of tailoring definitions to a specific application'', and is more pronounced in larger programs."
}
@TechReport{ans96rfi7, author = {TC X3J14}, title = {Clarifying the distinction between ``immediacy'' and ``special compilation semantics''}, institution = {ANSI TC X3J14}, year = {1996}, type = {RFI response}, number = {X3J14/Q0007R} }
@TechReport{ans99rfi-state, author = {TC X3J14}, title = {Regarding compilation while in Interpretation state}, institution = {ANSI TC X3J14}, year = {1999}, type = {RFI response}, number = {Q99-027}, url = {http://www.minerva.com/x3j14/queries/a99-027.txt} }
@Article{goodwin&wilken96, author = {David W. Goodwin and Kent D. Wilken}, title = {Optimal and Near-optimal Global Register Allocation Using 0-1 Integer Programming}, journal = spe, year = {1996}, volume = {26}, number = {8}, month = aug, pages = {929--965}, annote = {Describes global (intraprocedural) register allocation (with rematerialization, without live range splitting) as an integer programming problem, and uses a solver to get an optimal solution. The formulation as an integer program is pretty straightforward: There is one variable for every live range part and real register, where live ranges are divided into parts at definition, use, load, and store points; the main nontrivial point here is that store points are only necessary right after control flow splits and after the definition; load points are only necessary right before control flow joins and before uses. The paper also presents very encouraging results on the SPECint92 benchmarks for the Precision Architecture. The spill overhead is reduced dramatically, resulting in a 0\%--10\% speedup over GCC's original register allocator and a Briggs-style allocator. The register allocation times are quite long, but not exponential; empirically, they show $n^3$ complexity with respect to the number of instructions. The register allocation times can be reduced by an order of magnitude with little degradation in allocation quality.} }
@InProceedings{ruttenberg+96, author = {John Ruttenberg and G. R. Gao and A. Stoutchinin and W. Lichtenstein}, title = {Software Pipelining Showdown: Optimal vs. Heuristic Methods in a Production Compiler}, crossref = {sigplan96}, pages = {1--11}, annote = {The heuristic software pipeliner of the MIPSpro compiler for the R8000 is compared to the optimal MOST software pipeliner on the SPECFP and other benchmarks. The MIPSpro pipeliner is based on modulo scheduling. Surprisingly, the MIPSpro compiler performs as well as MOST for the scheduling itself, and due to better modeling (memory bank contention), it outperforms MOST overall. The authors state that there is still work to be done on loops with low trip counts.} }
@InProceedings{lee&leone96, author = {Peter Lee and Mark Leone}, title = {Optimizing ML with Run-Time Code Generation}, crossref = {sigplan96}, pages = {137--148}, annote = {They compile curried functions into code generators that generate specialized functions when applied to the first part of the arguments. The code generators are very fast, at 4--6 executed instructions per generated instruction. As a result, the break-even point for using this feature for optimization can occur very early. They present results on a few benchmarks, mainly matrix multiplication and packet filtering. The break-even points are at $20 \times 20$ dense matrices (better for sparse matrices) and at 250 packets; the asymptotic speedups are impressive (about 1.7 for dense matrices, 8 for sparse matrices, and 1.5 over the Berkeley C code for packet filtering).} }
@InProceedings{eichenberger&davidson96, author = {Alexandre E.
Eichenberger and Edward S. Davidson}, title = {A Reduced Multipipeline Machine Description that Preserves Scheduling Constraints}, crossref = {sigplan96}, pages = {12--22}, annote = {Representing scheduling constraints as finite state machines works for list scheduling, but not for algorithms (like most modulo scheduling schemes) that do not schedule cycle-by-cycle. In general, scheduling constraints can be represented by resource reservation tables. This paper describes how to reduce straightforward reservation tables to smaller reservation tables that represent the same scheduling constraints.} }
@InProceedings{bruggeman+96, author = {Carl Bruggeman and Oscar Waddell and R. Kent Dybvig}, title = {Representing Control in the Presence of One-Shot Continuations}, crossref = {sigplan96}, pages = {99--107}, annote = {Discusses how to represent multi-shot continuations and one-shot continuations in a stack-based Scheme implementation. One-shot continuations are programmer-specified (with the \emph{call/1cc} call). They offer a small (about 13\% in the threads benchmarks) performance benefit over multi-shot continuations, if they are applicable.} }
@InProceedings{engler96, author = {Dawson R. Engler}, title = {\textsc{vcode}: A Retargetable, Extensible, Very Fast Dynamic Code Generation System}, crossref = {sigplan96}, pages = {160--170}, annote = {Describes a low-level interface and system for dynamic code generation. Code is generated a function at a time. The interface essentially provides commands for generating code for an abstract RISC architecture; it also helps with the calling convention and with register allocation, i.e., all basic machine dependences. Code is produced separately for each virtual instruction, resulting in high code generation speed (10 instructions per generated instruction), but low run-time performance. Better run-time performance can be achieved by using machine-specific knowledge and features (e.g., there are features for delay slot filling).} }
@InProceedings{auslander+96, author = {Joel Auslander and Matthai Philipose and Craig Chambers and Susan J. Eggers and Brian N. Bershad}, title = {Fast, Effective Dynamic Compilation}, crossref = {sigplan96}, pages = {149--159}, annote = {Describes a compiler for C with annotations for dynamic code generation for specialization to certain run-time constants. The paper discusses mainly the analysis necessary to determine the run-time constants and the control flow. The run-time code generator optimizes quite a bit, and also has some inefficiencies, leading to relatively slow run-time code generation and late break-even points, but good asymptotic speedups. The annotations are not safe, i.e., a program can be broken by adding the wrong annotations.} }
@InProceedings{ramsey96, author = {Norman Ramsey}, title = {Relocating Machine Instructions by Currying}, crossref = {sigplan96}, pages = {226--236}, annote = {The paper describes how to derive the relocation information and relocation functions for a machine automatically from a description of the machine code, eliminating the machine dependence and redundant information present in traditional linkers. The linker and object files can be seen as the residual program of a partially evaluated assembler (the locations of some symbols are still missing). There is no need for a residual function at each relocation point; there is just a closure for each relocation point, and all closures share a few functions.
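A minimal C sketch of such a closure (our names, assuming a single patch-a-field transformation):
\begin{verbatim}
typedef struct reloc Reloc;
struct reloc {                   /* closure: shared code + environment */
    void (*apply)(Reloc *, unsigned long); /* one of the shared fns    */
    unsigned *site;              /* instruction word to patch          */
    unsigned shift, mask;        /* field within that word             */
};

static void patch_field(Reloc *r, unsigned long sym_addr)
{
    *r->site |= ((unsigned)(sym_addr >> r->shift)) & r->mask;
}
\end{verbatim}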
These functions correspond to traditional linker transformations, and the closures to traditional relocation items in object files.} }
@InProceedings{jagannathan&wright96, author = {Suresh Jagannathan and Andrew Wright}, title = {Flow-directed Inlining}, crossref = {sigplan96}, pages = {193--205}, annote = {Describes inlining in a larger context: flow analysis, the selection of inlining sites, and simplification are described in detail, and empirical results are given.} }
@InProceedings{ramalingam96, author = {G. Ramalingam}, title = {Data Flow Frequency Analysis}, crossref = {sigplan96}, pages = {267--277}, annote = {Extends conventional data flow analysis from qualitative information to quantitative information about the frequency and/or the probability of facts. A theoretical paper.} }
@Proceedings{sigplan96, booktitle = "SIGPLAN '96 Conference on Programming Language Design and Implementation", title = "SIGPLAN '96 Conference on Programming Language Design and Implementation", year = "1996", key = "PLDI '96" }
@Article{proebsting95toplas, author = "Todd A. Proebsting", title = "{BURS} Automata Generation", journal = toplas, year = "1995", volume = "17", number = "3", pages = "461--486", month = may, annote = "The journal version of \cite{proebsting92}." }
@InProceedings{wendt90, author = "Alan L. Wendt", title = "Fast Code Generation Using Automatically-Generated Decision Trees", pages = "9--15", booktitle = "SIGPLAN '90 Conference on Programming Language Design and Implementation", year = "1990", annote = "Describes code generation based on DAG rewriting. The code generators described in the paper are particularly fast, because the code generator generator combines several rules into profitable combinations in a preprocessing step. A training run is used to determine which combinations play a role in practice. This optimization halves the number of rule applications, and even the number of rules is slightly reduced." }
@Article{briggs&torczon93, author = "Preston Briggs and Linda Torczon", title = "An Efficient Representation for Sparse Sets", journal = "ACM Letters on Programming Languages and Systems", year = "1993", volume = "2", number = "1--4", pages = "59--69", annote = "Describes a set representation that is asymptotically more time-efficient than the classical bit vector for operations like \emph{clear-set} and \emph{forall}, but needs much more memory. The paper also presents empirical data from micro-benchmarks showing that the \emph{member}, \emph{add-member} and \emph{delete-member} operations are about three times slower on an RS/6000 with the new representation than with the bit-vector representation. It also presents empirical data from compilations of several routines, which shows that the new representation significantly reduces the register allocation time for these routines." }
@Article{appel94toplas, author = "Andrew W. Appel", title = "Axiomatic Bootstrapping: A Guide for Compiler Hackers", journal = toplas, year = "1994", volume = "16", number = "6", pages = "1699--1718", month = nov, annote = "Enhances the T-diagram formalism with constraints (called axioms in the paper), which allow a more precise and less restrictive description of bootstrapping and cross-compilation than opaque T-diagrams. This approach is explained with examples from SML/NJ compilation. However, the paper did not convince me that using this formalism makes solving such problems easier." }
@InProceedings{olukotun+96, author = "Kunle Olukotun and Basem A.
Nayfeh and Lance Hammond and Ken Wilson and Kunyung Chang", title = "The Case for a Single-Chip Multiprocessor", crossref = "asplos96", pages = "2--11", annote = "Presents a convincing argument that wider issue is not very cost-effective: the R5000 (single-issue for integers) has 70\% of the SPECint performance of the 4-issue R10000. They propose multiple processors per chip as an alternative and do an empirical comparison of a $4\times2$-way multiprocessor with separate primary caches and a shared on-chip secondary cache against a 6-way superscalar with the same amount of cache." }
@InProceedings{grunwald&neves96, author = "Dirk Grunwald and Richard Neves", title = "Whole-Program Optimization for Time and Space Efficient Threads", crossref = "asplos96", pages = "50--59", annote = "Presents two optimizations for threads: \begin{itemize}\item The stack space needed for a thread is computed from the call graph of the thread (back edges, i.e., recursion, are handled by allocating a new stack segment when crossing the edge). The resulting stack segments are usually much smaller than with the original policy of starting with a page-sized segment, resulting in fewer TLB misses. \item For cooperative context switches, the context switch overhead is reduced by saving and restoring only registers that are live at that point in the respective threads.\end{itemize} They present empirical results that show speedups of 12\%--21\% for the stack space optimization, 1\%--15\% for the context switch optimization, and 15\%--35\% for the combination." }
@InProceedings{philbin+96, author = "James Philbin and Jan Edler and Otto J. Anshus and Craig C. Douglas and Kai Li", title = "Thread Scheduling for Cache Locality", crossref = "asplos96", pages = "60--71", annote = "Multithreading can improve performance by allowing the scheduler to order the threads in a cache-conscious way. This paper presents a cache-conscious thread scheduling algorithm and empirical results for several applications." }
@InProceedings{chen+96asplos, author = "Peter M. Chen and Wee Teck Ng and Subhachandra Chandra and Christopher Aycock and Gurushankar Rajamani and David Lowell", title = "The Rio File Cache: Surviving Operating System Crashes", crossref = "asplos96", pages = "74--83", annote = "Battery-backed and write-protected RAM is just as persistent as disks, but faster. So it can be used as a write-back cache that provides as much reliability as synchronous writes provide now, with substantially improved performance. They performed crash tests indicating that such a cache without write protection is almost as reliable as a write-through system, and that with protection it is more reliable. The performance is similar to a memory file system, is much better than systems with delayed write-through (while having better reliability), and the advantage over synchronous write-through systems is even higher." }
@InProceedings{mckinley&temam96, author = "Kathryn S. McKinley and Olivier Temam", title = "A Quantitative Analysis of Loop Nest Locality", crossref = "asplos96", pages = "94--104", annote = "An empirical check of popular assumptions about the locality characteristics of numerical programs (the Perfect Benchmarks), in particular within loops. The results are that loop nests have different locality characteristics than the whole program and that some popular assumptions do not hold." }
@InProceedings{huang&shen96, author = "Andrew S.
Huang and John Paul Shen", title = "The Intrinsic Bandwidth Requirements of Ordinary Programs", crossref = "asplos96", pages = "105--114", annote = "This paper proposes using perfect caches of various sizes to estimate the main memory bandwidth requirements of specific programs. It performs such measurements for processors with various issue widths. Not surprisingly, different benchmarks have different bandwidth spectra (in some cases, even the instruction bandwidth spectrum is data dependent), and most of the time the bandwidth requirements increase linearly with issue rate (although the authors see it differently)." }
@InProceedings{seznec+96, author = "Andr\'e Seznec and St\'ephan Jourdan and Pascal Sainrat and Pierre Michaud", title = "Multiple-Block Ahead Branch Predictors", crossref = "asplos96", pages = "116--127", annote = "With increasing ILP, processors will have to predict not only the next branch, but later branches, to avoid a branch processing bottleneck. This paper describes and evaluates a mechanism for predicting two branches ahead." }
@InProceedings{chen+96, author = "I-Cheng K. Chen and John T. Coffey and Trevor N. Mudge", title = "Analysis of Branch Prediction via Data Compression", crossref = "asplos96", pages = "128--137", annote = "Explains two-level branch predictors as special cases of a data compression scheme (prediction by partial matching) and compares them with a predictor derived from an optimal version of that compression scheme. The current schemes are already close to this optimal predictor." }
@InProceedings{lipasti+96, author = "Mikko H. Lipasti and Christopher B. Wilkerson and John Paul Shen", title = "Value Locality and Load Value Prediction", crossref = "asplos96", pages = "138--147", annote = "Starts with the surprising observation that more than half of the dynamic loads (on both PPC and Alpha) get the same value that they loaded the last time that static load was executed. They exploit this fact with a mechanism that reduces the load latency for correctly predicted values to zero (with a one-cycle penalty on misprediction). They also propose a mechanism that completely avoids an access to the memory hierarchy for highly predictable loads, but this mechanism does not look very cost-effective to me. They show how their mechanisms could be integrated into the 21164 and into the PPC~620, and that they (in their simple forms) would provide a speedup of 6\% for the 21164 and 3\% for the PPC~620." }
@InProceedings{romer+96, author = "Theodore H. Romer and Dennis Lee and Geoffrey M. Voelker and Alec Wolman and Wayne A. Wong and Jean-Loup Baer and Brian N. Bershad and Henry M. Levy", title = "The Structure and Performance of Interpreters", crossref = "asplos96", pages = "150--159", annote = "They analyse the behaviour of four interpreters: MIPSI, Java, Perl, and Tcl; none of these interpreters (except, perhaps, MIPSI) seems to be written for high performance (the slowdown over C for the DES benchmark starts at a factor of 60 for MIPSI and reaches 4000 for Tcl). MIPSI has a low I-cache miss rate, the I-cache misses in Java are due to library calls, and Perl and Tcl have high I-cache miss rates due to their large virtual machines. The authors conclude that interpreters have no significant differences from other applications and that there is no need for architectural support for them; they should rather be improved at the software level." }
@InProceedings{luk&mowry96, author = "Chi-Keung Luk and Todd C.
Mowry", title = "Compiler-Based Prefetching for Recursive Data Structures", crossref = "asplos96", pages = "222--233", annote = "Proposes and evaluates three approaches for prefetching pointer-based data structures: \emph{Greedy prefetching} prefetches from all pointers in the node when the node is visited. \emph{History-pointer prefetching} uses an additional pointer in each node for each kind of walkthrough, that points to the node that was fetched $n$ nodes later on the last walk, and prefetches from that pointer. \emph{data-linearization prefetching} puts the nodes in an array and prefetches the node that is $n$ nodes further in the array; it works only if the nodes are walked in the same order in which they were built. They implemented greedy prefetching in a compiler, and applied the other methods manually. Greedy prefetching has little effect on most benchmarks, but has a significant effect on the three benchmarks that have a high fraction of load stalls (up to 31\% reduction in cycles). History-pointer prefetching was even more effective for the benchmark for which it was used (51\% reduction in cycles). Data-linearization prefetching also outdoes greedy prefetching for the two benchmarks where it was applied (19\%--22\% fewer cycles than the original)." } @Proceedings{asplos96, title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VII)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VII)", year = "1996", key = "ASPLOS-VII" } @MastersThesis{pirker95, author = {Christian Pirker}, title = {{{\"U}bersetzung von Forth in Maschinensprache}}, school = {{Technische Universit\"{a}t Wien}}, type = {Diplomarbeit}, year = {1995}, address = {Austria}, url = {http://www.complang.tuwien.ac.at/Diplomarbeiten/pirker95.ps.gz}, note = {In German}, abstract = {Forth is an extensible and interactive language. It provides two programmer-visible stacks (data and returnstack). The supplied instructions (words) manipulate data on the stacks. The efficiency of the stack access and control flow determine mainly the performance of Forth implementations.\par This thesis builds a compiler that generates native code for {\em MIPS RISC processors}. The compiler translates Forth programs into native code using state of the art compiler technology. The code is directly executable on the processor.\par The compiler generates a {\bf data flow graph} for each basic block of the program. Then simple {\bf instruction selection}, {\bf instruction scheduling} and {\bf register allocation} algorithms produce the native code. The algorithms try to reduce the stack operations and eliminate unneccesary stack pointer updates.\par The native code compiler is written in Forth and can compile itself. The compiler is integrated into the interpreter. Therefore it also handles interpreter words.\par Forth programs compiled by this compiler run about 13 to 196 \% faster compared to interpreted programs. Currently compiling takes 220 \% longer than compiling into interpreting code.} } @InProceedings{gloy+95, author = "Nicolas Gloy and Michael D. Smith and Cliff Young", title = "Performance Issues in Correlated Branch Prediction Schemes", crossref = "micro95", pages = "3--14", annote = "Evaluates the effect of static correlated branch prediction (and its code expansion), code layout and branch alignment on I-cache misses, branch mispredictions and branch misfetches. 
The code expansion of static correlation is significant, especially the expansion of the portion that is actually executed. The I-cache miss rate is reduced significantly by code layout, and increases slightly with increasing history length in static correlation. Branch prediction accuracy increases a little from static correlation, but dynamic correlation (with gshare) is usually better. The combination of the optimizations is measured for three machine models, using a \emph{cycles saved per 1000 instructions} metric. The best results are usually achieved with modest history lengths, sometimes even with 0 history (i.e., no static correlation, only code layout and branch alignment). The combination of the optimizations outperforms gshare without such compiler support on 21164-like and PA-8000-like machine models." }
@InProceedings{nair95, author = "Ravi Nair", title = "Dynamic Path-Based Branch Prediction", crossref = "micro95", pages = "15--23", annote = "Compares dynamic path-based branch prediction with dynamic pattern-based branch prediction. At the same hardware cost, there is little difference for long flush intervals (flushing was used to simulate context switching effects). But path-based schemes have a shorter training time for the same prediction accuracy, and therefore perform a little better at high flush rates." }
@InProceedings{calder+95, author = "Brad Calder and Dirk Grunwald and Amitabh Srivastava", title = "The Predictability of Branches in Libraries", crossref = "micro95", pages = "24--34", annote = "Finds the following: The (branching) behaviour of library code in one program can be predicted well by profiling other programs that make significant use of the library. Optimizing the library with this information can improve the performance of application programs significantly (depending on the amount of time that the application spends in the library), without additional cost to the application programmer. Using such optimized libraries with a profile-optimized main program is almost as good as profile-optimizing the program complete with the library. The heuristics of \cite{ball&larus93} sometimes fail miserably (for the Digital Unix libm in this case)." }
@Proceedings{micro95, title = "International Symposium on Microarchitecture (MICRO-28)", booktitle = "International Symposium on Microarchitecture (MICRO-28)", year = "1995", key = "MICRO-28" }
@InProceedings{Adl-Tabatai96, author = "Ali-Reza Adl-Tabatabai and Thomas Gross and Guei-Yuan Lueh", title = "Code Reuse in an Optimizing Compiler", crossref = "oopsla96", pages = "51--68", annote = "Presents some ways to reuse code in a compiler. Code examples in C++ are given. Not very spectacular, but, e.g., in register allocation this is, AFAIK, the first publication that gives a systematic view of the commonalities and differences of various register allocation methods. One nice feature of the compiler is that the same compiler binary can generate code for different architectures. The mapping of everything to classes that could just as well (or better) be expressed with conventional programming constructs (e.g., procedure variables) reinforces my impression that much of the OO stuff is just hype."
}
@InProceedings{dean+96, author = "Jeffrey Dean and Greg DeFouw and David Grove and Vassily Litvinov and Craig Chambers", title = "Vortex: An Optimizing Compiler for Object-Oriented Languages", crossref = "oopsla96", pages = "83--100", annote = "Presents an intermediate code and an optimizing compiler back-end for object-oriented languages, combined with front ends for several languages. The intermediate representation represents object-oriented concepts directly, not through lower-level concepts. This allows better optimization of object-oriented concepts. These optimizations achieve a speedup of up to 10 for Cecil programs, and of 10\%--30\% for Java, C++, and Modula-3 programs. Also contains other interesting empirical data about the benchmarks they used." }
@InProceedings{moore96, author = "Ivan Moore", title = "Automatic Inheritance Hierarchy Restructuring and Method Refactoring", crossref = "oopsla96", pages = "235--250", annote = "Restructures the hierarchy by first removing it (putting all attributes in all offspring classes of an eliminated class), then creating a new hierarchy by making sets of attributes into classes. Methods are refactored by a kind of common subexpression elimination. Refactoring is done over all methods, then the new hierarchy is built. The work is based on Self, which makes some of these things particularly easy. Refactoring and restructuring are applied to three hierarchies; they discovered some new classes and reduced the code size (measured as number of message sends) somewhat, but basically did not change the structure very much." }
@InProceedings{diwan+96, author = "Amer Diwan and J. Eliot B. Moss and Kathryn S. McKinley", title = "Simple and Effective Analysis of Statically-Typed Object-Oriented Programs", crossref = "oopsla96", pages = "292--305", annote = "Determines the effectiveness of four methods for detecting whether a method invocation is monomorphic (always calls the same procedure): \emph{Type hierarchy analysis} uses only information present in the class and method declarations. \emph{Intraprocedural type propagation} is a data flow analysis that can determine the type of the method recipient more accurately. \emph{Aggregate analysis} determines whether a container datatype contains just data of one type. \emph{Interprocedural type propagation} as used in this paper is a context-insensitive interprocedural version of type propagation. The language used in the paper is Modula-3, where NULL is a subtype of all types and overrides every method with the error method. Consequently, type hierarchy analysis can never determine that an invocation is monomorphic. For the benchmarks used, only intraprocedural and interprocedural type propagation yielded significant improvements, detecting that up to 35\% of the dynamic method invocations were monomorphic. To simulate the semantics of other languages, the authors also performed experiments that ignored the NULL type. These experiments showed type hierarchy analysis to be very effective, with aggregate analysis and interprocedural type propagation providing significant improvements in a few cases. Up to 95\% of all calls were detected to be monomorphic. In both variants, the whole program was assumed to be available for the analysis. The paper then looks at the causes for not being able to determine monomorphism (when ignoring NULL): In the majority of cases, the invocation was actually polymorphic. In the other cases, the reasons varied with the benchmark.
The paper also discusses the reasons for polymorphic invocations and transformations that may resolve them. 50\% of all dynamic method invocations are less than 60 instructions from each other (before analysis and any optimization derived from it)." }
@InProceedings{driesen&hoelzle96, author = "Karel Driesen and Urs H{\"o}lzle", title = "The Direct Cost of Virtual Function Calls in {C++}", crossref = "oopsla96", pages = "306--323", annote = "Studies the cost of virtual function calls on modern processors, taking into account the effects of out-of-order execution and caches. On their baseline architecture, the standard implementation of virtual function calls takes 1\%--10\% of the instructions and 2\%--29\% of the cycles. The thunk implementation is slightly faster for most benchmarks, and much faster for a few. The relative cost of virtual function calls will increase slightly in the future. The influence of architectural variations (branch misprediction penalty, branch prediction accuracy, issue width, and load latency) is also examined. The cost per dispatch is 2--5 cycles for most benchmarks (10 cycles for one benchmark) on their baseline architecture (4-wide processor, 2-cycle load latency, 4-cycle branch latency)." }
@InProceedings{bacon&sweeney96, author = "David F. Bacon and Peter F. Sweeney", title = "Fast Static Analysis of C++ Virtual Function Calls", crossref = "oopsla96", pages = "324--341", annote = "An empirical study of the effectiveness of \emph{unique name} analysis, \emph{class hierarchy analysis} and \emph{rapid type analysis}. Rapid type analysis is an analysis that takes into account which classes are actually instantiated. It is very fast and detects a significant percentage of monomorphic calls for some benchmarks (up to 100\%)." }
@Proceedings{oopsla96, title = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '96)", booktitle = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '96)", year = "1996", key = "OOPSLA '96", }
@InProceedings{kessler96, author = {Christoph W. Ke{\ss}ler}, title = {Scheduling Expression DAGs for Minimal Register Need}, booktitle = {Programming Languages: Implementations, Logics, and Programs (PLILP'96)}, series = {LNCS 1140}, year = {1996}, publisher = {Springer}, pages = {228--242}, annote = {A dynamic programming algorithm for basic-block scheduling for register allocation.} }
@Article{steenkiste&hennessy89, author = {Peter A. Steenkiste and John L. Hennessy}, title = {A Simple Interprocedural Register Allocation Algorithm and Its Effectiveness for {LISP}}, journal = toplas, year = {1989}, volume = {11}, number = {1}, month = jan, pages = {1--32} }
@Misc{sharnoff&robenalt, author = {David Muir Sharnoff and Steven Allen Robenalt}, title = {Catalog of compilers, interpreters, and other language tools}, howpublished = {http://www.idiom.com/free-compilers} }
@Article{ivanco&hunter90, author = {Tyler A. Ivanco and Geoffry Hunter}, title = {A User Definable Language Interface for {Forth}}, journal = jfar, year = {1990}, volume = {6}, number = {1} }
@Article{rodriguez&poehlman96, author = {Bradford J. Rodriguez and W. F. S.
Poehlman}, title = {A Survey of Object-Oriented {Forths}}, journal = sigplan, year = {1996}, month = apr, pages = {39--42}, url = {http://www.zetetics.com/bj/papers/oofs.htm} }
@Article{mckewan97, author = {Andrew McKewan}, title = {Object-Oriented Programming in {ANS Forth}}, journal = {Forth Dimensions}, year = {1997}, month = mar }
@InProceedings{gough97, author = {K. John Gough}, title = {Multi-Language, Multi-Target Compiler Development: Evolution of the Gardens Point Compiler Project}, crossref = {jmlc97}, pages = {17--40}, annote = {Describes a relatively recent compiler framework that uses a stack-based intermediate code (Dcode).} }
@Proceedings{jmlc97, title = {Modular Programming Languages (JMLC '97)}, booktitle = {Modular Programming Languages (JMLC '97)}, year = {1997}, key = {JMLC'97}, volume = {1204}, series = {LNCS}, publisher = {Springer} }
@Article{moore87, author = {Charles Moore}, title = {Forth -- eine pers{\"o}nliche Sprache}, journal = {Vierte Dimension}, year = {1987}, volume = {3}, number = {3}, month = oct, pages = {11--13}, note = {Translated into German by Klaus Schleisiek; the original is probably in \emph{More on NC4000}} }
@Book{stroustroup94, author = {Bjarne Stroustrup}, title = {The Design and Evolution of {C++}}, publisher = {Addison-Wesley}, year = {1994}, annote = {An extended version of \cite{stroustroup93}. Discusses the design philosophy of C++ and the specific design issues in detail.} }
@Article{oconnor&tremblay97, author = {J. Michael O'Connor and Marc Tremblay}, title = {PicoJava-I: The {Java} Virtual Machine in Hardware}, journal = {IEEE Micro}, year = {1997}, month = mar, pages = {45--53}, annote = {The PicoJava has a 4-stage single-issue pipeline, a 64-entry stack cache, and can execute some load instructions together with compute instructions.} }
@Article{christie96, author = {Dave Christie}, title = {Developing the {AMD-K5} Architecture}, journal = {IEEE Micro}, year = {1996}, month = apr, pages = {16--26} }
@Article{kessler&rauber95, author = {Christoph W. Ke{\ss}ler and Thomas Rauber}, title = {Generating Optimal Contiguous Evaluations for Expression {DAG}s}, journal = {Computer Languages}, year = {1995}, volume = {21}, number = {2}, pages = {113--127} }
@InProceedings{aiken&nicolau88esop, author = {Alexander Aiken and Alexandru Nicolau}, title = {Perfect Pipelining}, booktitle = {European Symposium on Programming (ESOP '88)}, volume = {300}, series = {LNCS}, year = {1988}, publisher = {Springer}, url = {http://theory.stanford.edu/~aiken/publications/papers/esop88.ps}, pages = {221--234} }
@InProceedings{aiken&nicolau88pldi, author = {Alexander Aiken and Alexandru Nicolau}, title = {Optimal Loop Parallelization}, booktitle = {SIGPLAN '88 Conference on Programming Language Design and Implementation}, year = {1988}, pages = {308--317} }
@Article{wirth88, author = {Niklaus Wirth}, title = {From {Modula} to {Oberon}}, journal = spe, year = {1988}, volume = {18}, number = {7}, month = jul, pages = {661--670} }
@Book{gabriel96, author = {Richard P.
Gabriel}, title = {Patterns of Software}, publisher = {Oxford University Press}, year = {1996}, annote = {A collection of essays on various topics, including Christopher Alexander, languages, an autobiography, and the story of Lucid, Inc.} }
@Article{zsoter96, author = {Andr{\'a}s Zs{\'o}ter}, title = {Does Late Binding Have to be Slow?}, journal = {Forth Dimensions}, year = {1996}, volume = {18}, number = {1}, pages = {31--35}, url = {http://www.forth.org/oopf.html} }
@Article{paysan94, author = {Bernd Paysan}, title = {Object Oriented {bigFORTH}}, journal = {Vierte Dimension}, year = {1994}, volume = {10}, number = {2}, note = {An implementation in ANS Forth is available at http://www.jwdt.com/~paysan/oof.zip} }
@Book{pountain87, author = {Dick Pountain}, title = {Object-Oriented {Forth}}, publisher = {Academic Press, London}, year = {1987} }
@Book{krishnamurthy95, title = "Practical Reusable {UNIX} Software", publisher = "John Wiley \& Sons", year = 1995, editor = "Balachander Krishnamurthy" }
@Article{balachandran+90, author = {A. Balachandran and D. M. Dhamdhere and S. Biswas}, title = {Efficient Retargetable Code Generation Using Bottom-Up Tree Pattern Matching}, journal = {Computer Languages}, year = {1990}, volume = {15}, number = {3}, pages = {127--140} }
@InProceedings{nair&hopkins97, author = {Ravi Nair and Martin E. Hopkins}, title = {Exploiting Instruction Level Parallelism in Processors by Caching Scheduled Groups}, crossref = {isca97}, pages = {13--25}, annote = {Proposes a microarchitecture consisting of a simple, slow engine and a parallel engine. The simple, slow engine is used for the execution of traces not contained in the DIF (Dynamic Instruction Formatting) cache and for hard problems (exceptions, memory disambiguation mispredictions); a VLIW-like parallel engine is used for quickly executing code in the DIF cache under the usual circumstances. Evaluates this idea with simulations and presents some interesting results; in particular, even a small DIF cache (256--1024 entries) provides good results.} }
@InProceedings{ebcioglu&altman97, author = {Kemal Ebcio\u{g}lu and Erik Altman}, title = {{DAISY}: Dynamic Compilation for 100\% Architectural Compatibility}, crossref = {isca97}, pages = {26--37}, annote = {Exploits significant amounts of instruction-level parallelism by using a VLIW, and translating code for existing architectures pagewise using a fast scheduler. The VLIW has special hardware support for this scheme.} }
@InProceedings{hakura&gupta97, author = {Ziyad S. Hakura and Anoop Gupta}, title = {The Design and Analysis of a Cache Architecture for Texture Mapping}, crossref = {isca97}, pages = {108--120}, annote = {Uses a cache for the texture memory in MIP-mapped texture mapping and analyses the performance. A cache of 16K gives good performance, reducing the required memory bandwidth by a factor of 3--15, especially when the texture mapping is performed in a tiled order.} }
@InProceedings{wilson&olukotun97, author = {Kenneth M. Wilson and Kunle Olukotun}, title = {Designing High Bandwidth On-Chip Caches}, crossref = {isca97}, pages = {121--132}, annote = {Varies the cache size, organization, and pipeline depth, and models the resulting cycle time, IPC, and overall execution time.} }
@InProceedings{moshovos+97, author = {Andreas Moshovos and Scott E. Breach and T. N. Vijaykumar and Gurindar S.
Sohi}, title = {Dynamic Speculation and Synchronization of Data Dependences}, crossref = {isca97}, pages = {181--193}, annote = {Shows that blindly speculating on the independence of memory accesses can hurt performance and proposes a solution for the problem.} }
@InProceedings{sodani&sohi97, author = {Avinash Sodani and Gurindar S. Sohi}, title = {Dynamic Instruction Reuse}, crossref = {isca97}, pages = {194--205}, annote = {Evaluates various ways to reuse the results of instructions that were executed with the same arguments. Such instructions arise from throwing away instructions upon branch misprediction, whether the instructions depended on the branch or not; they also arise from executing code several times with (partially) the same arguments. Several schemes for reuse are proposed, some depending on the data flow, some on the actual values. A significant percentage of the instructions is reused with the most aggressive scheme, resulting in speedups of 4\%--15\% (harmonic mean over all benchmarks) for the various schemes.} }
@InProceedings{palacharla+97, author = {Subbarao Palacharla and Norman P. Jouppi and J. E. Smith}, title = {Complexity-Effective Superscalar Processors}, crossref = {isca97}, pages = {206--218}, annote = {Estimates the cycle time of certain critical (non-pipelinable) components of an OOO superscalar processor at various feature sizes and for various degrees of superscalarity. For an 8-issue superscalar at 0.18$\mu$m, the critical components are the bypass logic and the wakeup and select logic. They then propose a microarchitecture that avoids this bottleneck: they partition the functional units into two 4-issue clusters (with a one-cycle delay for intercluster bypassing) to avoid the bypass bottleneck, and schedule the instructions into FIFOs of (preferably) dependent instructions to avoid the wakeup and select bottleneck. These changes have a small negative effect on the IPC, but a large positive effect on the (potential) cycle time, resulting in an average improvement of 16\% in speed.} }
@InProceedings{chang+97, author = {Po-Yung Chang and Eric Hao and Yale N. Patt}, title = {Target Prediction for Indirect Jumps}, crossref = {isca97}, pages = {274--283}, annote = {Applies the idea of two-level branch prediction to predicting the targets of indirect jumps (i.e., it adds (conditional) branch history to the address used for accessing the target cache). For the SPECint95 benchmarks involving many indirect jumps, perl and gcc, this results in prediction accuracies of 93\% and 63\% and execution time improvements of 14\% and 5\%.} }
@InProceedings{sprangle+97, author = {Eric Sprangle and Robert S. Chappell and Mitch Alsup and Yale N. Patt}, title = {The Agree Predictor: A Mechanism for Reducing Negative Branch History Interference}, crossref = {isca97}, pages = {284--291}, annote = {Reduces the number of conflict mispredictions by having the predictor entries predict whether or not some other predictor (say, a static predictor) is correct. This increases the chance that the predicted direction is correct in case of a conflict.} }
@Proceedings{isca97, title = "$24^\textit{th}$ Annual International Symposium on Computer Architecture", booktitle = "$24^\textit{th}$ Annual International Symposium on Computer Architecture", year = "1997", key = "ISCA 24", }
@Article{fraser&henry91, author = {Christopher W. Fraser and Robert R.
Henry}, title = {Hard-Coding Bottom-Up Code Generation Tables to Save Time and Space}, journal = spe, year = {1991}, volume = {21}, number = {1}, month = jan, pages = {1--12}, annote = {Describes how to optimize tree parsing automata. Some of these optimizations nowadays appear to trade too much time or complexity for space, but others save both time and space.} } @InProceedings{deutsch&schiffman84, author = {L. Peter Deutsch and Allen M. Schiffman}, title = {Efficient Implementation of the {Smalltalk-80} System}, booktitle = {Principles of Programming Languages (POPL'84)}, year = {1984}, pages = {297--302} } @InProceedings{proebsting97, author = "Todd A. Proebsting", title = "Simple Translation of Goal-Directed Evaluation", crossref = "sigplan97", pages = "1--6", annote = "Presents a method for translating expressions involving backtracking into simple goto-based code, without creating choicepoints. However, it does not discuss (in depth) how to translate backtracking procedures." } @InProceedings{collberg97, author = "Christian S. Collberg", title = "Reverse Interpretation $+$ Mutation Analysis = Automatic Retargeting", crossref = "sigplan97", pages = "57--70", annote = "Finds out how to generate code for an architecture by automatic reverse engineering of the code generator of a C compiler. In particular, the system uses the C compiler to generate assembly language; it feeds a large number of relatively simple C programs to the C compiler. First it discovers the assembler's syntax (this only works for conventional syntaxes), then the meaning of the instructions (using \emph{mutation analysis}, which analyses the differences in the output (if any) for slightly varying inputs), and generates a BEG code generator from this. The system has been tested on the integer instruction set of five register machines." } @InProceedings{ammons+97, author = "Glenn Ammons and Thomas Ball and James R. Larus", title = "Exploiting Hardware Performance Counters with Flow and Context Sensitive Profiling", crossref = "sigplan97", pages = "85--96", annote = "Introduces a clever way to do efficient profiling over (acyclic) paths. It also introduces context-sensitive profiling, which is based on the calling context tree, a slightly more accurate representation of the calling behaviour than the call graph; it allows metrics to be attributed accurately to callers (in the absence of recursion). Also presents some results of measurements for various metrics (e.g., cache misses)." } @InProceedings{clinger&hansen97, author = "William D. Clinger and Lars T. Hansen", title = "Generational Garbage Collection and the Radioactive Decay Model", crossref = "sigplan97", pages = "97--108", annote = "Shows that generational garbage collection can help even if the objects' life expectancy is independent of their age (i.e., the death rate is constant): Just collect a part that has not been collected for the longest time. The paper also presents some data on the lifetime of objects in real programs." } @InProceedings{poletto+97, author = "Massimiliano Poletto and Dawson R. Engler and M. Frans Kaashoek", title = "\textsf{tcc}: A System for Fast, Flexible, and High-Level Dynamic Code Generation", crossref = "sigplan97", pages = "109--121", annote = "Introduces `C, an extension of ANSI C for dynamic code generation, and tcc, a compiler for `C (current targets: SPARC and MIPS). Also presents some results." } @InProceedings{goodwin97, author = "David W.
Goodwin", title = "Interprocedural Dataflow Analysis in an Executable Optimizer", crossref = "sigplan97", pages = "122--133", annote = "Describes a fast interprocedural optimizer that works on executables. It first summarizes register information for procedures, then uses that information for various optimizations." } @InProceedings{ayers+97, author = "Andrew Ayers and Robert Gottlieb and Richard Schooler", title = "Aggressive Inlining", crossref = "sigplan97", pages = "134--145", annote = "Discusses the inlining and cloning performed by HP's compiler. These components are controlled by giving them a budget (a compile-time-increase by 100\%), which they can use up in several passes, inlining only the most promising candidates. Cloning alone has little effect, inlining has a significant effect (factor 1.3 on SPECint95), and the combination is slightly better." } @InProceedings{hashemi+97, author = "Amir H. Hashemi and David R. Kaeli and Brad Calder", title = "Efficient Procedure Mapping Using Cache Line Coloring", crossref = "sigplan97", pages = "171--182", annote = "Another method for reducing I-cache misses." } @InProceedings{eichenberger&davidson97, author = "Alexandre E. Eichenberger and Edward S. Davidson", title = "Efficient Formulation for Optimal Modulo Schedulers", crossref = "sigplan97", pages = "194--205" } @InProceedings{sperber&thiemann97, author = "Michael Sperber and Peter Thiemann", title = "Two for the Price of One: Composing Partial Evaluation and Compilation", crossref = "sigplan97", pages = "215--225" } @InProceedings{bergner+97, author = "Peter Bergner and Peter Dahl and David Engebretsen and Matthew O'Keefe", title = "Spill Code Minimization via Interference Region Spilling", crossref = "sigplan97", pages = "287--295", annote = "Adds interference region spilling (similar to live range splitting) to Chaitin-style allocators. The results are encouraging." } @InProceedings{lueh&gross97, author = "Guei-Yuan Lueh and Thomas Gross", title = "Call-Cost Directed Register Allocation", crossref = "sigplan97", pages = "296--307", annote = "This paper contains enough stuff for several papers: It introduces an improvement in register assignment that differentiates between caller-saved and callee-saved registers, and in some cases (where it is beneficial) rather spills a live range than assigning a register of the wrong class; the results are good. The paper also contains an empirical evaluation of various design options for graph-colouring register allocators. " } @InProceedings{ernst+97, author = "Jens Ernst and William Evans and Christopher W. Fraser and Steven Lucco and Todd A. Proebsting", title = "Code Compression", crossref = "sigplan97", pages = "358--365", annote = "Proposes two forms of compressed code based on lcc's intermediate language: the \emph{wire code} cannot be interpreted directly and is somewhat (0\%--25\%) smaller than gzipped SPARC code; one of the reasons for this is that they separate instructions and immediate data to help compression algorithms. They also describe an interpretable code called BRISC which uses \emph{operand specialization} and \emph{opcode combination} to be compact. BRISC code size is similar to gzipped Pentium code, its interpreter is about 12 times slower than native code, and code generated by a JIT is about 2.5 times slower than native code." 
@Proceedings{sigplan97, booktitle = "SIGPLAN '97 Conference on Programming Language Design and Implementation", title = "SIGPLAN '97 Conference on Programming Language Design and Implementation", year = "1997", key = "PLDI '97" } @Book{woehr92, author = "Jack Woehr", title = "Forth: The New Model", publisher = "M\&T Publishing/Prentice-Hall", year = "1992" } @Book{kelly&spies86, author = "Mahlon G. Kelly and Nicholas Spies", title = "FORTH: A Text and a Reference", publisher = "Prentice-Hall", year = "1986", note = "Available from Miller Microcomputer Services, 61 Lake Shore Road, Natick, MA, USA" } @InProceedings{grove+97, author = "David Grove and Greg DeFouw and Jeffrey Dean and Craig Chambers", title = "Call Graph Construction in Object-Oriented Languages", crossref = "oopsla97", pages = "108--124" } @InProceedings{zendra+97, author = "Olivier Zendra and Dominique Colnet and Suzanne Collin", title = "Efficient Dynamic Dispatch without Virtual Function Tables. The {SmallEiffel} Compiler.", crossref = "oopsla97", pages = "125--141", url = "http://smalleiffel.loria.fr/papers/oopsla97.ps.gz", html-part-url = "http://www.elj.com/elj-win32/ooplsa97-se-paper.html", annote = {The SmallEiffel Compiler does not use virtual function tables. Instead, it represents types with integers and uses binary search (coded as an if-tree) to find the right method for a selector. The compiler uses type analysis to determine the possible receiver types at each call site. The compiler generates a dispatch function for each (used) set of receiver types for a selector. The compiler recompiles the whole program every time, but is fast enough (5000 lines/s Eiffel-to-C on a Pentium Pro 200) to make this practical; the C compiler then recompiles only the changed parts. The paper presents some empirical results that supposedly show that the binary search outperforms the VFT approach for an unpredictable trimorphic call, and still performs better on some machines for a megamorphic (50 types) call. However, the paper assumes that rotating the targets in a fixed order makes them unpredictable, but that is not true in the presence of history-pattern-based branch predictors (e.g., PentiumPro), so don't put too much faith in the results of the polymorphic call benchmarks.} } @InProceedings{vitek+97, author = "Jan Vitek and R.
Nigel Horspool and Andreas Krall", title = "Efficient Type Inclusion Tests", crossref = "oopsla97", pages = "142--157" } @Proceedings{oopsla97, title = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '97)", booktitle = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '97)", year = "1997", key = "OOPSLA '97", } @Book{adobe90, author = {{Adobe Systems Incorporated}}, title = {PostScript Language --- Reference Manual}, publisher = {Addison-Wesley}, year = 1990, edition = {second} } @Book{adobe86blau, author = {{Adobe Systems Incorporated}}, title = {PostScript Language --- Tutorial and Cookbook}, publisher = {Addison-Wesley}, year = 1988 } @Book{adobe88gruen, author = {{Adobe Systems Incorporated}}, title = {PostScript Language --- Program Design}, publisher = {Addison-Wesley}, year = 1988 } @Book{brodie84, author = {Leo Brodie}, title = {Thinking Forth}, publisher = {Fig Leaf Press (Forth Interest Group)}, year = 1984, address = {100 Dolores St, Suite 183, Carmel, CA 93923, USA} } @Book{brodie04, author = {Leo Brodie}, title = {Thinking Forth}, publisher = {Punchy Publishing}, year = 2004, note = {Reprint of the 1984 edition, \url{http://thinking-forth.sourceforge.net/}} } @InProceedings{brouwer+98, author = "Klaus Brouwer and Wolfgang Gellerich and Erhard Ploedereder", title = "Myths and Facts about the Efficient Implementation of Finite Automata and Lexical Analysis", crossref = "cc98", pages = "1--15", annote = "Performs some measurements on Ada scanners implemented with Aflex (the Ada version of flex), REX, and several techniques implemented in hand-written scanners. The measurements included run-time on various machines, data and instruction cache misses on a Pentium, and the effects of using optimization in the compiler. The most surprising result (for me) was the huge difference between the generated and the hand-written scanners (a factor of 74 between the fastest scanner and the Aflex-generated one on a slow Sun). There was little difference between the various hand-written scanners. The paper also discusses the reasons for the low performance of the generated scanners (mainly because they are table-driven) and suggests some improvements." } @InProceedings{johnstone&scott98, author = "Adrian Johnstone and Elizabeth Scott", title = "Generalized Recursive-Descent Parsing and Follow-Determinism", crossref = "cc98", pages = "16--30", annote = "Generalized recursive-descent parsing can handle all grammars that are not left-recursive (even ambiguous ones), but potentially requires exponential time; on LL(1) grammars it is linear. The technique is quite simple: each parsing function builds a set of the ends of possible matches. The paper also introduces a property \emph{follow-determinism} of grammars, which makes it possible to have linear-complexity recursive-descent parsers for more grammars than before (e.g., some grammars that cannot be left-factored, and some non-LR grammars). A generator GRDP implementing these techniques is available." } @InProceedings{boyland98, author = "John Tang Boyland", title = "Analysing Direct Non-local Dependencies in Attribute Grammars", crossref = "cc98", pages = "31--49", annote = "Presents an extension to attribute grammars that allows rules dealing with parts of larger data structures (e.g., parts of symbol table entries), with correct execution order etc. The main part of the paper discusses how to analyse them at compiler-generation time."
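}

Apropos the brouwer+98 entry above: the gap between table-driven and hand-written scanners is easy to see in code. Below is a minimal sketch (an invented example, not from the paper) of the same identifier scanner in both styles; the table-driven loop performs two dependent table lookups per character, while the hand-coded version encodes the DFA states in the program counter.

  /* identifier scanner, table-driven vs. hand-coded (C99) */
  #include <ctype.h>
  #include <stddef.h>
  #include <stdio.h>

  enum { S_START, S_IDENT, S_STOP, NSTATES };     /* DFA states */
  enum { C_LETTER, C_DIGIT, C_OTHER, NCLASSES };  /* character classes */

  static const unsigned char next_state[NSTATES][NCLASSES] = {
    [S_START] = { S_IDENT, S_STOP,  S_STOP },
    [S_IDENT] = { S_IDENT, S_IDENT, S_STOP },
  };

  static int class_of(int c)
  {
    return isalpha(c) ? C_LETTER : isdigit(c) ? C_DIGIT : C_OTHER;
  }

  /* table-driven: every character costs a class lookup plus a
     dependent state-table lookup */
  static size_t scan_ident_table(const char *s)
  {
    int state = S_START;
    size_t i = 0;
    while ((state = next_state[state][class_of((unsigned char)s[i])])
           != S_STOP)
      i++;
    return i;  /* length of the identifier, 0 if none */
  }

  /* hand-coded: the DFA states have become positions in the code */
  static size_t scan_ident_direct(const char *s)
  {
    size_t i;
    if (!isalpha((unsigned char)s[0]))
      return 0;
    for (i = 1; isalnum((unsigned char)s[i]); )
      i++;
    return i;
  }

  int main(void)
  {
    printf("%zu %zu\n", scan_ident_table("ab1 "),
           scan_ident_direct("ab1 "));  /* both print 3 */
    return 0;
  }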
@InProceedings{knoop+98cc, author = "Jens Knoop and Dirk Kosch{\"u}tzki and Bernhard Steffen", title = "Basic-Block Graphs: Living Dinosaurs?", crossref = "cc98", pages = "63--79", annote = "Makes a case for single-instruction, edge-labeled data-flow graphs for data-flow analysis, mainly because they are simpler to analyse. Unfortunately the timings given in the paper are wrong (source: discussion with one of the authors)." } @InProceedings{martin+98, author = "Florian Martin and Martin Alt and Reinhard Wilhelm and Christian Ferdinand", title = "Analysis of Loops", crossref = "cc98", pages = "80--94", annote = "The context for this paper is data flow analysis of existing binary programs to find out the worst-case execution time (including cache effects). They improve their analysis by performing loop peeling during the analysis. This is somehow related to interprocedural analysis." } @InProceedings{lapkowski&hendren98, author = "Christopher Lapkowski and Laurie J. Hendren", title = "Extended SSA Numbering: Introducing SSA Properties to Languages with Multi-level Pointers", crossref = "cc98", pages = "128--143", annote = "Introduces a representation for data flow analysis that allows some pointer analysis. It is based on SSA numbering (similar to SSA form without $\phi$ nodes), but uses two numbers per pointer." } @InProceedings{kennedy+98, author = "Robert Kennedy and Fred Chow and Peter Dahl and Shin-Ming Liu and Raymond Lo and Mark Streich", title = "Strength Reduction via SSAPRE", crossref = "cc98", pages = "144--158", annote = "Discusses how to do strength reduction in the SSAPRE framework (partial redundancy elimination based on SSA form). The advantage of this approach is that it is more powerful than bit-vector based techniques (it can handle situations in one pass that would require a repetition of PRE and strength reduction in the bit-vector approach). OTOH, it also has some disadvantages through the absence of global information: in particular, linear function test replacement does not know all the replacement candidates, so it may choose a suboptimal replacement." } @InProceedings{ghiya+98, author = "Rakesh Ghiya and Laurie J. Hendren and Yingchun Zhu", title = "Detecting Parallelism in C Programs with Recursive Data Structures", crossref = "cc98", pages = "159--173" } @InProceedings{cooper&simpson98, author = "Keith D. Cooper and L. Taylor Simpson", title = "Live Range Splitting in a Graph Coloring Register Allocator", crossref = "cc98", pages = "174--187", annote = "This splitting approach uses the containment graph, a directed form of the interference graph: there is a directed edge from a live range $b$ to a live range $a$ if $b$ is not live at any definition or use of $a$ ($a$ contains $b$), but $a$ and $b$ conflict; in that case $a$ can be \emph{split around} $b$, and both can occupy the same register. The splitting method presented in the paper is passive, splitting only if the node would be spilled, and if splitting appears to be cheaper than spilling. The paper presents some empirical results that show an improvement over the plain spilling approach. I find them not very convincing, because they only use splitting where it appears better than spilling; in this setting only a technique that is dominated by the old technique (i.e., one that is never better) would not show an improvement."
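}

The containment relation described in the cooper&simpson98 annotation above is compact enough to state in code. A minimal sketch, assuming an invented representation of live ranges as 64-bit sets of program points (the paper's actual data structures are not given in the annotation):

  /* "a contains b": they interfere, but b is not live at any def or
     use of a; then a can be split around b and share its register */
  #include <stdbool.h>
  #include <stdio.h>

  typedef struct {
    unsigned long long live;    /* program points where the range is live */
    unsigned long long defuse;  /* program points that define or use it */
  } LiveRange;

  static bool interfere(const LiveRange *a, const LiveRange *b)
  {
    return (a->live & b->live) != 0;  /* live sets overlap */
  }

  static bool contains(const LiveRange *a, const LiveRange *b)
  {
    return interfere(a, b) && (a->defuse & b->live) == 0;
  }

  int main(void)
  {
    LiveRange a = { 0xffULL, 0x81ULL }; /* live at 0-7, def/use at 0 and 7 */
    LiveRange b = { 0x7eULL, 0x42ULL }; /* live at 1-6 */
    printf("%d\n", contains(&a, &b));   /* 1: a can be split around b */
    return 0;
  }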
@InProceedings{gupta98, author = "Rajiv Gupta", title = "A Code Motion Framework for Global Instruction Scheduling", crossref = "cc98", pages = "219--233", annote = "Presents a global instruction scheduling method based on moving single instructions into delay slots, somewhat like percolation scheduling and the approach of \cite{bernstein&rodeh91}. No empirical evaluation is presented." } @InProceedings{stuempel+98, author = "Esther St{\"u}mpel and Michael Thies and Uwe Kastens", title = "{VLIW} Compilation Techniques for Superscalar Architectures", crossref = "cc98", pages = "234--248", annote = "Models the PPC~604 as a VLIW processor (which presented some interesting problems) and uses a compiler that applies various scheduling techniques to it. Presents empirical results." } @InProceedings{engelbrecht&kourie98, author = "R. L. Engelbrecht and D. G. Kourie", title = "Issues in Translating Smalltalk to Java", crossref = "cc98", pages = "249--263", annote = "Presents some of the challenges in translating Smalltalk to Java, and how to deal with them; in particular: in Smalltalk every selector can be sent to any object; Smalltalk's variables can have any type; Smalltalk's class methods have no equivalent in Java." } @InProceedings{steindl98, author = "Christoph Steindl", title = "Intermodular Slicing of Object-Oriented Programs", crossref = "cc98", pages = "264--278", annote = "Presents a slicer for Oberon programs. It uses a combination of a conservative approach and user feedback to deal with the problem of function pointers that is present at every method invocation. The slicer has been implemented and is practical, typically producing a slice in one second." } @InProceedings{petterson98, author = "Mikael Pettersson", title = "Portable Debugging and Profiling", crossref = "cc98", pages = "279--293", annote = "This paper deals with the problem of producing a portable debugger for a language implementation that achieves portability by translating into C. The approach taken in the paper is to add debugging code to the C program that is quite similar to what a C compiler with debugging turned on inserts into the machine program; the effects are also similar: 22\% slowdown and 300\% code growth (from the talk; the numbers in the paper are different); this does not give information on variable values to the debugger. This approach is also applied to profiling." } @InProceedings{leino&nelson98, author = "K. Rustan M. Leino and Greg Nelson", title = "An Extended Static Checker for Modula-3", crossref = "cc98", pages = "302--305", annote = "The checker works by taking a program (usually extended with some annotations), generating logical formulas from that, and leaving it to a theorem prover to check them. It does not prove the correctness of the program, but can find some errors." } @Proceedings{cc98, title = "Compiler Construction (CC'98)", booktitle = "Compiler Construction (CC'98)", year = "1998", key = "CC'98", editor = "Kai Koskimies", OPTvolume = "1383", OPTseries = "LNCS", publisher = "Springer LNCS~1383", address = "Lisbon" } @InProceedings{smolka98, author = "Gert Smolka", title = "Concurrent Constraint Programming Based on Functional Programming", crossref = "esop98", pages = "1--11", annote = "Adds promises and futures to ML, which allows doing things like logic variables, coroutining, and possibly attributed variables (meta-structures), as known from Prolog."
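}

Since smolka98's promises and futures may be unfamiliar outside ML, here is the concept transplanted into C with POSIX threads (an invented illustration; the paper's setting is a functional language, and none of this code is from it). A future behaves like a logic variable: it is bound at most once, and reading it blocks until it is bound. Compile with -lpthread.

  #include <pthread.h>
  #include <stdio.h>

  typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int             is_bound;
    int             value;
  } Future;

  static void future_init(Future *f)
  {
    pthread_mutex_init(&f->lock, NULL);
    pthread_cond_init(&f->cond, NULL);
    f->is_bound = 0;
  }

  /* the promise side: bind the future exactly once */
  static void future_bind(Future *f, int v)
  {
    pthread_mutex_lock(&f->lock);
    f->value = v;
    f->is_bound = 1;
    pthread_cond_broadcast(&f->cond);
    pthread_mutex_unlock(&f->lock);
  }

  /* the future side: block until bound, then return the value */
  static int future_force(Future *f)
  {
    pthread_mutex_lock(&f->lock);
    while (!f->is_bound)
      pthread_cond_wait(&f->cond, &f->lock);
    int v = f->value;
    pthread_mutex_unlock(&f->lock);
    return v;
  }

  static void *producer(void *arg)
  {
    future_bind((Future *)arg, 42);  /* fulfil the promise */
    return NULL;
  }

  int main(void)
  {
    Future f;
    pthread_t t;
    future_init(&f);
    pthread_create(&t, NULL, producer, &f);
    printf("%d\n", future_force(&f)); /* blocks until bound; prints 42 */
    pthread_join(&t, NULL);
    return 0;
  }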
@InProceedings{chin+98, author = "Wei-Ngan Chin and Siau-Cheng Khoo and Tat-Wee Lee", title = "Synchronisation Analysis to Stop Tupling", crossref = "esop98", pages = "75--89", annote = "Tupling groups calls with common arguments together, so their multiple results can be computed simultaneously (in some sense it is a functional equivalent of loop fusion); it's a powerful transformation, allowing reductions in complexity. The problem is, as usual, ensuring that the transformation stops. The paper introduces \emph{synchronization analysis} for indicating when tupling can be safely applied." } @InProceedings{jay&steckler98, author = "C. B. Jay and P. A. Steckler", title = "The Functional Imperative: Shape!", crossref = "esop98", pages = "139--153", annote = "They compile \textsf{FiSh}, a functional language for array computation, into very efficient code by using \emph{shape analysis}, a kind of type analysis that includes array sizes. Another, probably equally important reason for the efficiency is that the compiler performs full inlining of everything (in particular, higher-order functions). They present empirical results showing a huge speedup over the Glasgow Haskell compiler, and speedups of 1.25--7 over ocamlopt." } @InProceedings{knoop+98esop, author = "Jens Knoop and Oliver R{\"u}thing and Bernhard Steffen", title = "Code Motion and Code Placement: Just Synonyms?", crossref = "esop98", pages = "154--169", annote = "Discusses some variations in partial redundancy elimination algorithms. Maybe a good starting point for this topic." } @InProceedings{ross&sagiv98, author = "John L. Ross and Mooly Sagiv", title = "Building a Bridge between Pointer Aliases and Program Dependences", crossref = "esop98", pages = "221--235", annote = "Presents a way to map the may-alias problem (are two references possible aliases at one point?) to the program dependence problem (does a program point possibly depend on another?)." } @InProceedings{scherlis98, author = "William L. Scherlis", title = "Systematic Change of Data Representation: Program Manipulations and a Case Study", crossref = "esop98", pages = "252--266", annote = "Presents three class manipulation transformations: \emph{Shift} moves a common computation from all wrapping (constructor) sites to all unwrapping sites or vice versa, possibly accomplishing representation changes. \emph{Project} and \emph{idempotency} do the obvious thing. The paper illustrates these techniques by deriving the Java \texttt{String} and \texttt{StringBuffer} classes from a C-like string class through program transformations." } @Proceedings{esop98, title = "Programming Languages and Systems (ESOP'98)", booktitle = "Programming Languages and Systems (ESOP'98)", year = "1998", key = "ESOP'98", OPTvolume = "1381", OPTseries = "LNCS", publisher = "Springer LNCS~1381", address = "Lisbon" } @Article{cooper+98, author = "Keith D. Cooper and Timothy J. Harvey and Linda Torczon", title = "How to Build an Interference Graph", journal = spe, year = "1998", volume = "28", number = "4", pages = "425--444", url = "http://softlib.cs.rice.edu/MSCP/papers/hash.ps.gz", annote = "Empirically compares the bit-matrix representation of interference graphs to a hash-table representation. Also presents some improvements to the classical bit-matrix approach: they use separate graphs for integer and FP registers; and they don't use an extra pass to determine the sizes of the edge lists, but use an extensible edge-list representation.
The empirical results are for graphs of FORTRAN functions with up to 5936 nodes and up to 723605 edges; the split bit-matrix is the best method both speedwise and in size for most graphs; the best hashing method beats the split bit-matrix sizewise in one case and speedwise in three cases (out of 169). The authors admit that for even larger graphs the hash table would be more beneficial." } @InProceedings{appelbe+98, author = "Bill Appelbe and Raja Das and Reid Harmon", title = "Future Branches -- Beyond Speculative Execution", crossref = "acac98", pages = "1--13", annote = "Future branch instructions are a modern variant of branch delay slots: The branch contains the address where it will take effect, allowing the branch to move up a considerable distance and reducing branch misprediction penalties. To deal with misspeculation, there is also an \emph{undo future branch} instruction. The implementation they propose uses a fully associative \emph{pending branch table}, which is managed as a queue, and whose minimum size is programmer-visible (the compiler must avoid too many pending branches)." } @InProceedings{biglari-abhari+98, author = "Morteza Biglari-Abhari and Michael J. Liebelt and Kamran Eshraghian", title = "Implementing a {VLIW} Compiler: Motivation and Trade-offs", crossref = "acac98", pages = "37--46" } @InProceedings{channon&koch98, author = "David Channon and David Koch", title = "A Study of Sparse 2-Dimensional Translation Lookaside Buffers", crossref = "acac98", pages = "47--56", annote = "Compares various TLB implementations for very large address spaces." } @InProceedings{littin+98, author = "Richard H. Littin and J. A. David McWha and Murray W. Pearson and John G. Cleary", title = "Block Based Execution and Task Level Parallelism", crossref = "acac98", pages = "57--66", annote = "Proposes and evaluates a machine with an instruction set consisting of fixed-length basic blocks that differentiates between intra-basic-block data flow and inter-basic-block data flow, similar to the multiscalar approach \cite{sohi+95}." } @InProceedings{omondi98, author = "Amos R. Omondi", title = "Fast Floating-Point Addition Without Operand Conversion", crossref = "acac98", pages = "145--155", annote = "Presents an FP adder that does addition of the mantissae in sign-magnitude representation instead of converting them to two's complement representation." } @InProceedings{siemers&moeller98, author = "Christian Siemers and Dietmar P. F. M{\"o}ller", title = "The $>$S$<$puter: A Novel Microarchitecture Model for Execution inside Superscalar and {VLIW} Processors Using Reconfigurable Hardware", crossref = "acac98", pages = "169--178", annote = "The microarchitecture presented here contains a number of functional units with reconfigurable interconnects. The decoder deals with a basic block at a time and configures the FU network for the instructions in the basic block." } @InProceedings{szyperski+98, author = "Clemens Szyperski and Paul Roe and Siu Yuen Chan and Geoff Elgey", title = "Garden's Autobahn: Efficient and Safe Streaming of Data Structures for High Performance Communication Architectures", crossref = "acac98", pages = "193--203", annote = "Presents a combination of language feature, compiler technique and hardware for transmitting irregular data structures over a network, with many desirable properties, such as efficiency." } @InProceedings{zhu&wong98, author = "Y. Zhu and W. F.
Wong", title = "The Effect of Instruction Dependency on Superscalar Processor Performance", crossref = "acac98", pages = "215--226", annote = "Presents a theoretical model for predicting performance for a class of superscalar machines; this allows running a benchmark once (to determine the benchmark characteristics important for the model), and predicting the performance of several microarchitectures. They also present some empirical data on the accuracy of their results (-19\%--14\% error for the IPC predictions)." } @Proceedings{acac98, title = "Computer Architecture 98 (ACAC '98)", booktitle = "Computer Architecture 98 (ACAC '98)", year = "1998", key = "ACAC '98", editor = "John Morris", volume = "20", number = "4", series = "Australian Computer Science Communications", publisher = "Springer", address = "Perth" } @Article{bhamidipaty&proebsting98, author = "Achyutram Bhamidipaty and Todd A. Proebsting", title = "Very Fast YACC-Compatible Parsers (For Very Little Effort)", journal = spe, year = "1998", volume = "28", number = "2", pages = "181-190", annote = "Presents a simple approach to generate hard-coded parsers, once you have an LALR automaton. They give special care to being completely yacc-compatible, including error handling. The resulting parsers are 2--6 times faster than yacc-generated parsers (bison-generated parsers are slightly slower still), but consume more memory (up to five times as large for gcc's parser, that is 75KB)." } @Article{boehm&weiser88, author = "Hans-Juergen Boehm and Mark Weiser", title = "Garbage Collection in an Uncooperative Environment", journal = spe, year = "1988", volume = "18", number = "9", pages = "807--820", annote = "One of the first papers on conservative garbage collection. Their freelist implementation allows them to find out quickly whether a piece of memory is managed by their garbage collector, and whether a value is a valid pointer to the start of an object. They report experiences in the context of their Russel implementation, for replacing the normal allocator in two large C programs, and for using the technique in debugging (to find leaks and premature frees)." } @Article{wentworth90, author = "E. P. Wentworth", title = "Pitfalls of Conservative Garbage Collection", journal = spe, year = "1990", volume = "20", number = "7", pages = "719--727", annote = "Does some measurements on space leakage of conservative garbage collectors due to pointer misidentification, in a 16-bit address space. LISP programs exhibit a pretty constant leakage overhead, which is acceptable. OTOH, some KRC programs exhibit unbounded leakage, due to the use of lazy lists: if some list element is referenced by a spurious pointer, the whole list (growing into infinity) starting at that element will be retained." } @Article{zorn93, author = "Benjamin Zorn", title = "The Measured Cost of Conservative Garbage Collection", journal = spe, year = "1993", volume = "23", number = "7", pages = "733-756", techreport-url = "ftp://ftp.cs.colorado.edu/pub/cs/techreports/zorn/CU-CS-573-92.ps.Z", annote = "Presents empirical data on the performance of four malloc/free implementations and the Boehm-Weiser garbage collector on six C programs. The Boehm-Weiser collector was competetive speedwise, but required up to 2.5 times as much memory as the most space-efficient malloc/free library. Consequently, it does not perform as well as the others for certain physical memory sizes. 
Four of the programs contain additional memory managers, but their use generally does not help space- or time-wise; in several cases, there is even a slowdown; choosing a good general-purpose allocator seems more worthwhile. The Boehm-Weiser collector had a bad effect on cache performance, probably due to its freelist design and the effects of the marking pass." } @Article{nevill-manning+98, author = "Craig G. Nevill-Manning and Todd Reed and Ian H. Witten", title = "Extracting Text from Postscript", journal = spe, year = "1998", volume = "28", number = "5", pages = "481--491", annote = "Uses a distiller-like technique to extract plain text from PostScript. Word breaks and character breaks are recognized by the spacing. No technique for extracting more structural information is presented." } @Article{allen+98, author = "Vicki Allan and Steven J. Beatty and Bogong Su and Philip H. Sweany", title = "Building a Retargetable Local Instruction Scheduler", journal = spe, year = "1998", volume = "28", number = "3", pages = "249--283", annote = "Discusses some topics in instruction scheduling. They present a complex timing model, and explore some variations of scheduling algorithms: instruction-driven vs. operation-driven list scheduling, foresight and lookahead to avoid scheduling failures (possible with their timing model), and different scheduling directions. They discuss the integration of scheduling and register allocation. In their experimental evaluation, operation-driven scheduling looks better than instruction-driven scheduling, and the decision between forward and reverse scheduling depends on the architecture (they suggest taking the better of both). They also evaluate heuristics: optimizing a linear polynomial of 24 heuristics with a genetic algorithm beats the best of 2500 randomly generated polynomials by 5\%; however, I wonder if this speedup resulted from optimizing for their test cases, and if it would also show up for other test cases. Some questionable results (e.g., that straight list scheduling produced scheduling failures for the Alpha architecture) throw doubt on the whole paper." } @Article{norris&pollock98, author = "Cindy Norris and Lori L. Pollock", title = "The Design and Implementation of RAP: A PDG-based Register Allocator", journal = spe, year = "1998", volume = "28", number = "4", pages = "401--424", annote = "Presents a hierarchical register allocation algorithm that differs from \cite{callahan&koblenz91} in the following: It uses the PDG's regions instead of single-entry single-exit tiles; and it does everything on a bottom-up pass instead of delaying register assignment until a top-down pass. It does not exploit the PDG's partial ordering of instructions. The empirical results (for small benchmarks on register-starved machines) show a slow-down of 1\%--4\% in the generated code compared to a comparable non-hierarchical register allocator. The main reason seems to be the way spill code is inserted." } @Article{shaw88, author = {George W. Shaw}, title = {Forth Shifts Gears}, journal = {Computer Language}, year = {1988}, pages = {67--75 (May), 61--65 (June)}, annote = {Discusses multiple code fields, their application for implementing \texttt{value} and \texttt{defer}, and how they are implemented. Also discusses state-smart words and how to use MCFs to avoid them.} } @Book{papadimitriou&steiglitz82, title = "Combinatorial Optimization: Algorithms and Complexity", author = "Christos H.
Papadimitriou and Kenneth Steiglitz", publisher = "Prentice-Hall", address = "Englewood Cliffs, NJ", year = "1982", } @Misc{paysan98, author = {Bernd Paysan}, title = {Re: State-smart etc Was: Re: Facelifting my Forth }, howpublished = {Usenet newsgroup comp.lang.forth, message 351B70D4.F31@\linebreak[0]remove.muenchen.this.org.junk}, month = mar, year = {1998}, annote = {Describes how to implement \texttt{interpret/compile:}.} } @Misc{bradley96, author = {Mitch Bradley}, title = {{Re: Another solution for RFIs 8 and 9}}, howpublished = {Message 9609231729.AA06128@FirmWorks.COM to the mailing list ansforth@minerva.com}, year = {1996}, month = sep } @Misc{gforth, key = {Gforth}, title = {Gforth home page}, howpublished = {http://www.complang.\linebreak[0]tuwien.ac.at/forth/gforth/} } @Misc{proebsting98, author = {Todd Proebsting}, title = {Least-Cost Instruction Selection in {DAG}s is {NP}-Complete}, howpublished = {http://research.microsoft.com/\~{}toddpro/papers/\linebreak[0]proof.htm}, url = {http://research.microsoft.com/~toddpro/papers/proof.htm}, year = {1998} } @InProceedings{eichenberger+1995, author = "Alexandre E. Eichenberger and Edward S. Davidson and Santosh G. Abraham", booktitle = "Proceedings of the 1995 International Conference on Supercomputing", title = "Optimum Modulo Schedules for Minimum Register Requirements", year = "1995", url = "http://www.eecs.umich.edu/PPP/ICS95.ps", keywords = "Software pipelining, Register sensitive modulo scheduling, instruction level parallelism, VLIW, Superscalar", month = jul, pages = "31--40" } @ARTICLE{moon&ebcioglu97, AUTHOR = {S. Moon and K. Ebcioglu}, TITLE = {Parallelizing Nonnumerical Code with Selective Scheduling and Software Pipelining}, JOURNAL = {ACM Transactions on Programming Languages and Systems}, VOLUME = 19, NUMBER = 6, PAGES = {853--898}, MONTH = {November}, YEAR = 1997} @InProceedings{bodik+98, author = {Rastislav Bod\'ik and Rajiv Gupta and Mary Lou Soffa}, title = {Complete Removal of Redundant Expressions}, crossref = {sigplan98}, pages = {1--14}, annote = {Partial redundancy elimination sometimes is hindered by control flow joins. This paper explores code replication to eliminate this problem, but tries to limit code growth by replicating only those blocks necessary, or by profile-guided \emph{selective restructuring} or speculation. These improvements allowed the removal of up to 2\% of the dynamically executed instructions (but they indicate that this disappointing result may be an artifact of the basic compiler used).} } @InProceedings{lo+98, author = {Raymond Lo and Fred Chow and Robert Kennedy and Shin-Ming Liu and Peng Tu}, title = {Register Promotion by Sparse Partial Redundancy Elimination of Loads and Stores}, crossref = {sigplan98}, pages = {26--37}, annote = {Register promotion is performed by performing partial redundancy elimination of the loads and stores of the values to be promoted. For loads, this is the same as normal partial redundancy elimination and can be performed in the SSAPRE framework. For stores, the paper introduces a dual of SSAPRE, SSUPRE (U=use). The improvement through PRE of stores is small, however (only 1.2\% of all stores and 4.2\% of the redundant stores are eliminated); apparently dead and faint store elimination eliminates most redundancy before that. PRE of loads works well, however, eliminating 25.6\% of all loads.
Various forms of speculation have only a small (and not always positive) effect on the number of loads and stores executed.} } @InProceedings{ammons&larus98, author = {Glenn Ammons and James R. Larus}, title = {Improving Data-Flow Analysis with Path Profiles}, crossref = {sigplan98}, pages = {72--84}, annote = {Improves data-flow analysis by extending the control-flow graph into a hot-path graph, performing data-flow analysis on this graph, then eliminating unnecessary duplicates. The paper evaluates the technique by applying it to constant propagation, resulting in speedups of -4.4\%--9.8\%.} } @InProceedings{diwan+98, author = {Amer Diwan and Kathryn S. McKinley and J. Eliot B. Moss}, title = {Type-Based Alias Analysis}, crossref = {sigplan98}, pages = {106--117}, annote = {An empirical evaluation of three type-based alias analyses for Modula-3. The simplest analysis is quite imprecise using the established metric (average number of intraprocedural aliases of a reference), but the more sophisticated analyses (based on taking field accesses into account; and based on a flow-insensitive analysis of variable assignments and references) are much better. In the bottom line, for (fully) redundant load elimination, they all perform about equally well, resulting in speedups of 1\%--8\%. They also compare their analysis to an upper bound and conclude that at most 2.5\% of the remaining loads could be eliminated by more precise analysis.} } @InProceedings{sastry+98, author = {S. Subramanya Sastry and Subbarao Palacharla and James E. Smith}, title = {Exploiting Idle Floating-Point Resources for Integer Execution}, crossref = {sigplan98}, pages = {118--129}, annote = {They propose adding integer operations to floating-point units (like MMX was added). They evaluate a compilation technique that simply puts all integer operations in the FP unit that do not have to be in the integer unit because they are needed to compute addresses, or due to calling conventions; i.e., store-value slices and branch slices are moved into the FP unit. An advanced scheme inserts inter-partition copies and duplicates some instructions to increase the possibility for offloading. These methods offload 5.7\%--29.5\% (basic scheme) and 8.3\%--41.6\% (advanced scheme) of the integer instructions for SPECint95 into the FP unit, resulting in speedups of 1.6\%--20.8\% (2.5\%--23.1\%) on a $2+2$-issue machine and 0.8\%--17.1\% (1.0\%--18.0\%) on a $4+4$-issue machine. Interestingly, this aggressive approach had little effect on the SPECfp benchmarks, except for ear, where 18\% of the integer instructions could be offloaded, resulting in 18\% speedup on a $2+2$-issue machine.} } @InProceedings{traub+98, author = {Omri Traub and Glenn Holloway and Michael D. Smith}, title = {Quality and Speed in Linear-Scan Register Allocation}, crossref = {sigplan98}, pages = {142--151}, annote = {Presents a register allocation method called binpacking, where the program is scanned from start to end, and registers are allocated when needed (with the usual problems at control-flow joins). The paper presents an improvement called second-chance binpacking, which is similar to live range splitting for graph coloring allocators. An empirical comparison of second-chance binpacking with a graph coloring register allocator shows an increase of 0\%--8.6\% in executed instructions and -3.4\%--8.2\% slowdown in run-time for some SPEC benchmarks.
Binpacking is advertised as fast, but the register allocation times presented are not impressive (slower than graph colouring for small functions).} } @InProceedings{cheng+98, author = {Perry Cheng and Robert Harper and Peter Lee}, title = {Generational Stack Collection and Profile-Driven Pretenuring}, crossref = {sigplan98}, pages = {162--173}, annote = {Generational stack collection reduces the time needed for scanning the stack for roots by applying generational techniques; this is based on the fact that the deeper regions of the stack often do not change between two collections. Profile-driven pretenuring identifies allocation sites that usually produce long-lived data, and tenures all data allocated by these sites immediately, thus eliminating a lot of copying.} } @InProceedings{clinger98, author = {William D. Clinger}, title = {Proper Tail Recursion and Space Efficiency}, crossref = {sigplan98}, pages = {174--185}, annote = {A theoretical specification of \emph{proper tail recursion}.} } @InProceedings{wickline+98, author = {Philip Wickline and Peter Lee and Frank Pfenning}, title = {Run-time Code Generation and Modal-ML}, crossref = {sigplan98}, pages = {224--235}, annote = {Presents a variant of ML with special constructs for run-time code generation (staging), and how it is translated into an abstract machine called CCAM. Hard to read.} } @InProceedings{xi&pfenning98, author = {Hongwei Xi and Frank Pfenning}, title = {Eliminating Array Bound Checking Through Dependent Types}, crossref = {sigplan98}, pages = {249--257}, annote = {Extends ML with yet another static type constructor that, when used well by the programmer, allows the compiler to eliminate many array bound checks.} } @InProceedings{bacon+98, author = {David F. Bacon and Ravi Konuru and Chet Murthy and Mauricio Serrano}, title = {Thin Locks: Featherweight Synchronization for Java}, crossref = {sigplan98}, pages = {258--268}, annote = {They optimize locking in Java by having thin locks in the objects that cover the frequent cases, with an escape to fat locks that are in separate tables.} } @InProceedings{adl-tabatabai+98, author = {Ali-Reza Adl-Tabatabai and Micha{\l} Cierniak and Guei-Yuan Lueh and Vishesh M. Parikh and James M. Stichnoth}, title = {Fast, Effective Code Generation in a Just-In-Time Java Compiler}, crossref = {sigplan98}, pages = {280--290}, annote = {Presents some techniques that are used in Intel's JIT for IA32. \emph{Lazy code selection} uses a simulated stack that contains objects representing various addressing modes; essentially an application of treeless tree parsing code selection to RAFTS. \emph{Common subexpression elimination} compares sequences of byte codes, and only tries to find expressions common with expressions present in registers. Register allocation gives four registers to local variables, and three registers plus any free variable registers to local register allocation. A very simple array bound checking optimization is implemented that can eliminate bounds checking for some accesses with constant indexes. Exception code is moved out-of-line. They also describe how they deal with garbage collection. Experiments show that the most important optimization is register allocation; the others have little influence.
The compile time is 0.5--1s for the benchmarks used.} } @InProceedings{piumarta&riccardi98, author = {Ian Piumarta and Fabio Riccardi}, title = {Optimizing Direct Threaded Code by Selective Inlining}, crossref = {sigplan98}, pages = {291--300}, url = {ftp://ftp.inria.fr/INRIA/Projects/SOR/papers/1998/ODCSI_pldi98.ps.gz}, annote = {They reduce the overhead of a direct threaded interpreter by combining all the VM instructions in a basic block into a single virtual machine instruction. This is done by simply concatenating the machine code for the virtual machine instructions (except for the Next code). Multiple instances of the same sequence are just translated once, and then reused. They evaluated this technique empirically in the context of a fine-grained RISC-like VM, and in an Objective Caml interpreter. The speedup over plain direct threaded code for the RISC-like VM is a factor of 1.33--2.06. For the Caml VM the speedups varied with benchmark and processor, from 1 to 2.2. The code expansion ranges from 2.2 to 4 for the Sparc, with larger benchmarks having less expansion (due to more reuse of sequences). Implementing this technique on the Caml VM took only one day.} } @InProceedings{ayers+98, author = {Andrew Ayers and Stuart de Jong and John Peyton and Richard Schooler}, title = {Scalable Cross-Module Optimization}, crossref = {sigplan98}, pages = {301--312}, annote = {Optimization takes too much memory (1.7KB per source line) to allow straightforward whole-program optimization in main memory. This paper describes various forms of compacting the intermediate representations, up to offloading the IR to disk. This reduces the memory consumption for optimizing 126.gcc from 250MB to 25MB, while increasing optimization time from 18~min to 30~min. The paper also presents other interesting issues in whole-program optimization, like speedups for SPECint95 benchmarks (1.1--2.25) and huge CAD applications (1.4--1.7), dealing with bugs uncovered by optimization, and build compatibility.} } @InProceedings{gay&aiken98, author = {David Gay and Alex Aiken}, title = {Memory Management with Explicit Regions}, crossref = {sigplan98}, pages = {313--323}, annote = {Does an empirical evaluation of region/arena/zone-based memory allocation, and proposes and evaluates a safe version of this technique, based on reference counting (references into a whole region).} } @InProceedings{sweeney&tip98, author = {Peter F. Sweeney and Frank Tip}, title = {A Study of Dead Data Members in C++ Applications}, crossref = {sigplan98}, pages = {324--332}, annote = {On average, 12.5\% of the data members are dead, taking up 4.4\% of the space.} } @InProceedings{necula&lee98, author = {George C. Necula and Peter Lee}, title = {The Design and Implementation of a Certifying Compiler}, crossref = {sigplan98}, pages = {333--344}, annote = {They use an optimizing compiler in combination with an assembly-language certifier.} } @Proceedings{sigplan98, booktitle = "SIGPLAN '98 Conference on Programming Language Design and Implementation", title = "SIGPLAN '98 Conference on Programming Language Design and Implementation", year = "1998", key = "PLDI '98" } @Book{rubinstein88, author = {Richard Rubinstein}, title = {Digital Typography}, publisher = {Addison-Wesley}, year = {1988}, annote = {Discusses many issues involved in typography and the problems of dealing with it automatically.
Includes an annotated bibliography.} } @InProceedings{park&moon98, author = {Jinpyo Park and Soo-Mook Moon}, title = {Optimistic Register Coalescing}, booktitle = {Parallel Architectures and Compilation Techniques (PACT '98)}, pages = {196--204}, year = {1998} } @InProceedings{simons98, author = {Anthony J. H. Simons}, title = {Borrow, Copy or Steal? Loans and Larceny in the Orthodox Canonical Form}, crossref = {oopsla98}, pages = {65--83}, annote = {Gives a good discussion of dealing with memory (de)allocation and change semantics in C++ (and other languages without automatic storage reclamation). The standard solution is to copy everywhere. This paper presents borrowing (copy-on-write), and stealing, a more complex scheme, where the user of the object ensures that the reference to the object is dead, and therefore no copying is necessary. The techniques are useful in different circumstances.} } @InProceedings{litvinov98, author = {Vassily Litvinov}, title = {Constraint-Based Polymorphism in Cecil: Towards a Practical and Static Type System}, crossref = {oopsla98}, pages = {388--411}, annote = {Presents a type system for Cecil (and other object-oriented programming languages), and uses it for statically type-checking Vortex, a 100000 line Cecil program. Apart from adding type annotations, very little code needed to be rewritten. But the static typechecker also found only a few bugs.} } @Proceedings{oopsla98, title = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '98)", booktitle = "Conference on Object-Oriented Programming Systems, Languages \& Applications (OOPSLA '98)", year = "1998", key = "OOPSLA '98", } @InProceedings{seidl&zorn98, author = {Matthew L. Seidl and Benjamin G. Zorn}, title = {Segregating Heap Objects by Reference Behaviour and Lifetime}, crossref = {asplos98}, pages = {12--23}, annote = {Uses a profile-feedback method to predict whether an object is highly referenced, not highly referenced, short-lived, or other. Several predictors were used (plain allocation site is not sufficient), in particular call path and stack content. The stack content predictor did quite well for some programs and some VM sizes, but overall I was not really convinced.} } @InProceedings{sodani&sohi98, author = {Avinash Sodani and Gurindar S. Sohi}, title = {An Empirical Analysis of Instruction Repetition}, crossref = {asplos98}, pages = {35--45}, annote = {Previous papers have observed that many dynamically executed instructions have the same inputs (and thus the same outputs) as earlier instances of the same instruction. This paper analyses where these repetitions come from; it not only considers instructions where all instances have the same arguments, but uses a repetition buffer with up to 1000 instances. The results are: Very few static instructions cause most of the dynamic repetition (except for m88ksim); 56\%--99\% of the dynamic instructions are repetitions; 5\%--40\% of the repetitions are due to instructions with a single instance. The paper uses data flow analysis on traces to determine where the repeated inputs come from, globally: external input, global initialized data, program internals (immediate and immediate-derived, e.g., loop counters), uninitialized data. Internals dominate (52\%--86\% of repeated instructions), followed by global init (14\%--30\%) and external (0\%--30\%); uninit plays a minor role ($<$1\%).
The paper also does an analysis that takes functions into account, in particular argument repetition: 59\%--98\% of dynamic function calls are repetitions of a call with all arguments the same; only 0\%--16\% of the dynamic calls do not have a single repeated argument. The paper breaks down the instruction repetitions in terms of their role in the function. One interesting result peripheral to the paper is that the function prologue and epilogue code takes 2\%--25\% of the executed instructions. It also comments on the possibility of exploiting the repetitions in software, but is not very encouraging. It also comments on hardware exploitation, which could capture 30\%--74\% of all dynamic instructions, but does not look very useful to me.} } @InProceedings{lee+98, author = {Walter Lee and Rajeev Barua and Matthew Frank and Devabhaktuni Srikrishna and Jonathan Babb and Vivek Sarkar and Saman Amarasinghe}, title = {Space-Time Scheduling of Instruction-Level Parallelism on a Raw Machine}, crossref = {asplos98}, pages = {46--57}, annote = {The Raw machine consists of a number of R2000-based tiles, each of which is a complete processor. The execution model used is called NURA (non-uniform register access), which allows exploiting instruction-level parallelism. The paper describes and evaluates the scheduler for this architecture. One of the interesting results is that the multiple PCs (allowing partially asynchronous operation) make the performance much less vulnerable to effects that are not statically predictable.} } @InProceedings{hammond+98, author = {Lance Hammond and Mark Willey and Kunle Olukotun}, title = {Data Speculation Support for a Chip Multiprocessor}, crossref = {asplos98}, pages = {58--69}, annote = {The machine architecture here uses parallelism among small threads, with all communication happening through memory, and some hardware support for speculation and synchronization.} } @InProceedings{machanick+98, author = {Philip Machanick and Pierre Salverda and Lance Pompe}, title = {Hardware-Software Trade-Offs in a Direct Rambus Implementation of the RAMpage Memory Hierarchy}, crossref = {asplos98}, pages = {105--114}, annote = {Proposes and evaluates using software-controlled paging between SRAM and DRAM instead of using hardware-controlled cache policies. That would allow getting rid of the tags of the last cache level, and using this chip area for other purposes, among other things. The evaluation shows that managing DRAM with paging is competitive.} } @InProceedings{roth+98, author = {Amir Roth and Andreas Moshovos and Gurindar S. Sohi}, title = {Dependence Based Prefetching for Linked Data Structures}, crossref = {asplos98}, pages = {115--126}, annote = {Proposes and evaluates a hardware mechanism that records access patterns and prefetches memory locations that will probably be needed by pointer-chasing code.} } @InProceedings{le98, author = {Bich C. Le}, title = {An Out-of-Order Execution Technique for Runtime Binary Translators}, crossref = {asplos98}, pages = {151--158}, annote = {The paper attacks the problem of dealing with exceptions in a binary translated, scheduled program. It uses a checkpointing approach. The task is particularly simple because the paper assumes a 1:1 mapping of architectural registers of the emulated machine to registers of the executing machine, and the executing machine has extra registers. Each superblock start is a checkpoint, with all values in the emulated architectural registers.
One performance problem with this compiler-based reordering is false exceptions (exceptions speculatively taken by the translated code but not by the emulated code); this problem is solved by retranslating the superblock without scheduling after one occurrence of a false exception.} } @InProceedings{stark+98, author = {Jared Stark and Marius Evers and Yale N. Patt}, title = {Variable Length Path Branch Prediction}, crossref = {asplos98}, pages = {170--179} } @InProceedings{temam98, author = {Olivier Temam}, title = {Investigating Optimal Local Memory Performance}, crossref = {asplos98}, pages = {218--227}, url = {http://www.lri.fr/~temam/Articles/old-Te98.ps.gz}, annote = {Presents an optimal algorithm (with foresight) for exploiting both space and time locality and uses it for evaluating areas where the current caching policies could benefit. One of the conclusions is that the meat in improving caches is in improving the replacement policy because many of the words contained in a cache are dead and can be replaced without causing a miss. A surprising result is that the usual way of determining cache lines gives better results than methods that appear less arbitrary.} } @InProceedings{citron+98, author = {Daniel Citron and Dror Feitelson and Larry Rudolph}, title = {Accelerating Multi-Media Processing by Implementing Memoing in Multiplication and Division Units}, crossref = {asplos98}, pages = {252--261} } @InProceedings{fu+98, author = {Chao-ying Fu and Matthew D. Jennings and Sergei Y. Larin and Thomas M. Conte}, title = {Value Speculation Scheduling for High Performance Processors}, crossref = {asplos98}, pages = {262--271}, annote = {Combines hardware for predicting results of instructions with a scheduling technique like run-time disambiguation.} } @InProceedings{ranganathan&franklin98, author = {Narayan Ranganathan and Manoj Franklin}, title = {An Empirical Study of Decentralized ILP Execution Models}, crossref = {asplos98}, pages = {272--281}, annote = {Compares different models of distributing instructions among different units in a CPU: execution unit based decentralization (EDD) is the normal way of grouping instructions by functional unit. Data dependence based decentralization (DDD) groups instructions together that have a data dependence relationship; Control dependence based decentralization (CDD) groups instructions that have the same control dependences (the CPU/superblock style hardware). The paper assumes that inter-execution-unit communication happens in a uni- or bidirectional ring (not very realistic for DDD). The paper analyses how these models scale to more execution units: EDD scales pretty badly. The smarter DDD algorithm performs best, but loses performance beyond 8 EUs (probably because of the ring architecture), and is not worthwhile beyond 4 EUs. CDD has no such thrashing, but manages to catch up with DDD's 4-EU performance only with 8--16 EUs.} } @InProceedings{schnarr&larus98, author = {Eric Schnarr and James R. Larus}, title = {Fast Out-of-Order Processor Simulation Using Memoization}, crossref = {asplos98}, pages = {283--294}, annote = {The simulator described in the paper performs cycle-accurate simulation by executing code fragments directly for functional simulation, intermixed with a memoizing computation of what happens at the cycle level. The result is a speedup factor of 5--12.} } @InProceedings{jacob&mudge98, author = {Bruce L. Jacob and Trevor N.
Mudge}, title = {A Look at Several Memory Management Units, TLB-Refill Mechanisms, and Page Table Organizations}, crossref = {asplos98}, pages = {295--306}, annote = {Describes the MMU/TLB/page-table organizations of various machines and their operating systems, and evaluates and analyses the performance impact of the various choices.} } @Proceedings{asplos98, title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VIII)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-VIII)", year = "1998", key = "ASPLOS-VIII" } @Book{giampaolo99, author = {Dominic Giampaolo}, title = {Practical File System Design}, publisher = {Morgan Kaufmann}, year = {1999}, annote = {Gives a good overview of file system design issues, with BFS (the BeOS file system) as a running example; in addition to the usual stuff, this book discusses journaling, indexing (BFS supports indexes over file attributes that are always up-to-date), and APIs, as well as file system benchmarking and testing.} } @InProceedings{visser+98, author = {Eelco Visser and Zine-el-Abidine Benaissa and Andrew Tolmach}, title = {Building Program Optimizers with Rewriting Strategies}, crossref = {icfp98}, pages = {13--26}, annote = {Uses a somewhat Prolog-like language for defining rewriting strategies.} } @InProceedings{findler&flatt98, author = {Robert Bruce Findler and Matthew Flatt}, title = {Modular Object-Oriented Programming with Units and Mixins}, crossref = {icfp98}, pages = {94--104}, annote = {Discusses how both the classes (mixins) and the operations in a class can be extended in MzScheme. An important concept here is the module (unit). The paper explains the basic problem and its solution quite nicely.} } @InProceedings{finne+98, author = {Sigbjorn Finne and Daan Leijen and Erik Meijer and Simon Peyton Jones}, title = {H/Direct: A Binary Foreign Function Interface for Haskell}, crossref = {icfp98}, pages = {153--162}, annote = {The interface is defined in IDL.} } @InProceedings{karczmarczuk98, author = {Jerzy Karczmarczuk}, title = {Functional Differentiation of Computer Programs}, crossref = {icfp98}, pages = {195--203}, annote = {Computational differentiation works by computing all needed derivatives of an operation along with the operation; for a complex function, this computes all needed derivatives of the function at one point along with the function result. This paper explores how to do this in a functional context.} } @Proceedings{icfp98, title = {International Conference on Functional Programming (ICFP '98)}, booktitle = {International Conference on Functional Programming (ICFP '98)}, year = {1998}, key = {ICFP '98}, note = {SIGPLAN Notices 34(1) (1999)} } @Article{briggs+98, author = {Preston Briggs and Keith D. Cooper and Timothy J. Harvey and L. Taylor Simpson}, title = {Practical Improvements to the Construction and Destruction of Static Single Assignment Form}, journal = spe, year = {1998}, volume = {28}, number = {8}, pages = {859--881}, annote = {Presents several improvements to SSA construction and destruction: 1) Semi-pruned SSA form has almost as few $\phi$-nodes as the pruned form, but can be built faster; however, the speedup is small, especially compared to the time needed in an optimization like value numbering. 2) An improvement in stack manipulation when building SSA form.
3) Dealing with the problems of converting the parallel $\phi$-nodes into sequential copies, especially after copy folding; one of the problems they attack is register shuffling, but I don't find their algorithm and their explanation of it very impressive.} } @Article{yuen99, author = {C. K. Yuen}, title = {Stack and RISC}, journal = can, year = {1999}, volume = {27}, number = {1}, month = mar, pages = {3--9}, annote = {Discusses using out-of-order mechanisms for stack machines (stack reorder buffer). One interesting argument is that stack machines provide an advantage, because with them the death of a value is known immediately, so we do not need the writeback of the value.} } @Article{postiff+99, author = {Matthew A. Postiff and David A. Greene and Gary S. Tyson and Trevor N. Mudge}, title = {The Limits of Instruction Level Parallelism in SPEC95 Applications}, journal = can, year = {1999}, volume = {27}, number = {1}, month = mar, pages = {31--34}, annote = {Some interesting results: Stack pointer updates are in the critical path of many benchmarks; there is parallelism across millions of dynamically executed instructions.} } @TechReport{steele77, author = {Guy Lewis {Steele Jr.}}, title = {Debunking the ``Expensive Procedure Call'' Myth or Procedure Call Implementations Considered Harmful or Lambda: The Ultimate Goto}, institution = {MIT AI Lab}, year = {1977}, type = {AI Memo}, number = {443}, month = oct, annote = {Takes a look at procedure calls, both from the implementation side (advocating tail-call optimization and caller-saved registers), and from the programming language design and programming side (in the context of the structured programming debate). One interesting aspect is that it advocates dividing even quite simple push-and-jump variants into several instructions (a very RISCy approach).} } @InProceedings{moessenboeck90, author = {Hanspeter {M\"ossenb\"ock}}, title = {A Generator for Production-Quality Compilers}, booktitle = {Compiler Compilers}, pages = {48--61}, year = {1990}, volume = {477}, series = {LNCS}, publisher = {Springer}, annote = {A description of Coco/R, with comments on the history and the design decisions. About half of the paper deals with error recovery.} } @Article{fleming&wallace86, author = {Philip J. Fleming and John J. Wallace}, title = {How not to Lie with Statistics: The Correct Way to Summarize Benchmark Results}, journal = cacm, year = {1986}, volume = {29}, number = {3}, month = mar, pages = {218--221}, annote = {Advocates the use of the geometric mean in summarizing normalized benchmark results.} } @Article{smith88, author = {James E. Smith}, title = {Characterizing Computer Performance with a Single Number}, journal = cacm, year = {1988}, volume = {31}, number = {10}, month = oct, pages = {1202--1206}, annote = {Advocates the use of the arithmetic mean for times, the harmonic mean for rates, and normalizing after aggregating (i.e., not averaging normalized numbers).} } @Book{knuth99, author = {Donald E. Knuth}, title = {Digital Typography}, publisher = {CSLI Publications}, year = {1999}, address = {Stanford, CA}, annote = {A collection of articles and other stuff about \TeX{} and Metafont.} } @InProceedings{johnstone&wilson98, author = {Mark S. Johnstone and Paul R.
Wilson}, title = {The Memory Fragmentation Problem: Solved?}, booktitle = {International Symposium on Memory Management (ISMM '98)}, pages = {26--36}, year = {1998}, volume = {34}, number = {3}, series = {SIGPLAN Notices}, annote = {Compares a number of memory allocation strategies wrt fragmentation (with explicit deallocation), using several real programs as workload (instead of a synthetic workload). A number of strategies exhibit very low fragmentation. In particular, all the best fit strategies did quite well (best with address ordered free lists), and also first fit with address ordering, and Doug Lea's allocator. The paper gives a number of explanations for these results.} } @InProceedings{chang&gibson99, author = {Fay Chang and Garth A. Gibson}, title = {Automatic I/O Hint Generation through Speculative Execution}, crossref = {osdi99}, pages = {1--14}, annote = {While the actual process is blocked by reads, a speculative thread runs ahead and generates read hints (used for prefetching) instead of reads (and, of course, it does not produce output). If the speculative thread appears to stray from the normal execution path or fall behind, it is reinitialized from the current state of the main thread. This idea has been tested on three read-intensive programs and produced 29\%--70\% speedup on a machine with a 4-disk RAID~0 (fewer disks produce less speedup, more disks hardly more). If the hints were ignored, the slowdown was 1\%--4\%, indicating that the scheme costs little if it is useless; however, the Gnuld benchmark had a 15\% slowdown when using the hints with one disk, apparently because scarce disk bandwidth is wasted on useless prefetches. For two of the benchmarks the hints are quite accurate, but for Gnuld many hints caused prefetches of useless blocks, and useful blocks were not prefetched; Gnuld was also the only benchmark where manual prefetching seriously outperformed the new technique.} } @InProceedings{pai+99, author = {Vivek S. Pai and Peter Druschel and Willy Zwaenepoel}, title = {IO-Lite: A Unified I/O Buffering and Caching System}, crossref = {osdi99}, pages = {15--28}, annote = {IO-Lite avoids copying by using arbitrary-sized read-only buffers and passing (arrays of) descriptors for these buffers around. Thus data can be read in from a file into a web server and passed to the networking code without ever being copied. The paper reports a throughput increase of 40\%--80\% for web serving. To get all the performance advantages, the applications need to use new interfaces, but even with the stdio interface there is a measurable speedup (a factor of 1.03) in gcc. The paper is a little light on how the applications and the OS manage the buffers.} } @InProceedings{wang+99, author = {Randolph Y. Wang and Thomas E. Anderson and David A. Patterson}, title = {Virtual Log Based File System for a Programmable Disk}, crossref = {osdi99}, pages = {29--43}, annote = {This paper contains a number of interesting ideas, but IMO many of the conclusions are flawed or unsupported. The first interesting idea is that nowadays disks are so intelligent that they could take over more tasks, including providing the file system (but how does that fit with RAIDs?). This paper, however, appears to focus on the idea that the disk intelligence is used to organize the disk as a log (with compaction done by the disk); the special twist in this log is that it tries to write immediately, on any free block close to the current head position (eager writing), instead of writing the log in segments.
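As I understand it, the block allocation for eager writing amounts to something like the following C sketch (my invention, not the paper's code; a real implementation would also take rotational position into account):
\begin{verbatim}
/* My sketch of eager writing (names invented): choose the free
   block closest to the current head position. */
#include <stdlib.h>

typedef struct { int nblocks; const char *free_map; } Disk;

int eager_write_target(const Disk *d, int head_pos)
{
    int best = -1;
    for (int b = 0; b < d->nblocks; b++)
        if (d->free_map[b] &&
            (best < 0 || abs(b - head_pos) < abs(best - head_pos)))
            best = b;
    return best;                 /* -1 means no free block */
}
\end{verbatim}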
Unfortunately cleaning (the reason for having segments in normal logs) is discussed in a tech report, not in the paper. The main benefit of this scheme claimed by the paper is better performance on small synchronous writes; I see two flaws here: 1) The paper suggests alleviating the read performance problems of this scheme by using large caches; thus there will hardly be any disk reads between writes and eager writing should not perform significantly better than a conventional log (and their data seems to support this view). 2) The paper assumes that the pointer to the end of the log will be written to a fixed block upon power failure; if the disk is able to do that, it should also be able to write a full track of data to the disk upon power failure; the disk could use this to delay writes, as if it had a full track of NVRAM, and use this to optimize writing and thus alleviate the small-synchronous-writes problem. The paper gives numbers from synthetic benchmarks using a simulation of the disks (apparently assuming that the disk writes synchronously), showing significant improvements from the virtual log disk for UFS, and for some scenarios also over a log-structured file system.} } @InProceedings{dougan+99, author = {Cort Dougan and Paul Mackerras and Victor Yodaiken}, title = {Optimizing the Idle Task and Other {MMU} Tricks}, crossref = {osdi99}, pages = {229--237}, url = {http://hq.fsmlabs.com/~cort/papers/linuxppc-mm/linuxppc-mm.ps}, annote = {Discusses various optimizations of memory management stuff in Linux/PPC. The optimizations are: mapping the kernel with BAT (block address translation) registers instead of the TLB; better choice of segment IDs (VSIDs) to get a higher hash table hit ratio; hand optimizing the TLB miss code; on the 603, don't use the hash tables upon TLB miss, use the page tables directly; instead of flushing stale entries from the TLB and hash table, just change the involved VSID; turn off the cache on TLB miss to avoid polluting the cache with page table entries; clear free pages in the idle task, with caches turned off. Not all of these optimizations are supported with convincing data in the paper, but the effect of their combination is quite good. One interesting result was that apparently the kernel compile benchmark was originally suffering quite a lot from TLB misses (just mapping the kernel with BATs reduced wall-clock time by a factor of 1.25).} } @InProceedings{hutchinson+99, author = {Norman C. Hutchinson and Stephen Manley and Mike Federwisch and Guy Harris and Dave Hitz and Steven Kleiman and Sean O'Malley}, title = {Logical vs. Physical File System Backup}, crossref = {osdi99}, pages = {239--249}, annote = {Discusses and measures logical (i.e., file-based) vs. physical (i.e., block-based) file system backups, in particular in the context of the WAFL file system; WAFL (a variation on log-structured file systems) makes it possible to have incremental physical backups. On the performance side, physical backup is about 20\% faster than dump for a single tape drive, uses much less CPU (5\% vs. 25\%), and scales better for more tape drives (logical backup becomes disk-bound with four tapes and also consumes 90\% of the CPU).
The paper also contains a bit of info on WAFL.} } @InProceedings{groenvall+99, author = {Bj\"orn Gr\"onvall and Assar Westerlund and Stephen Pink}, title = {The Design of a Multicast-Based Distributed File System}, crossref = {osdi99}, pages = {251--264}, annote = {Describes the design of JetFile.} } @InProceedings{gopal&manber99, author = {Burra Gopal and Udi Manber}, title = {Integrating Content-Based Access Mechanisms with Hierarchical File Systems}, crossref = {osdi99}, pages = {265--278}, annote = {Includes a long Related Work section.} } @Proceedings{osdi99, title = {Operating Systems Design and Implementation (OSDI '99)}, booktitle = {Operating Systems Design and Implementation (OSDI '99)}, year = {1999}, key = {OSDI '99} } @PhdThesis{pelegri-llopart88, author = {Eduardo Pelegri-Llopart}, title = {Rewrite Systems, Pattern Matching, and Code Generation}, school = {University of California, Berkeley}, year = {1988} } @InProceedings{serrano97, author = {Manuel Serrano}, title = {Inline Expansion: \emph{When} and \emph{How}}, booktitle = {Programming Languages, Implementation and Logic Programming (PLILP)}, pages = {143--157}, year = {1997}, volume = {1292}, series = {LNCS}, publisher = {Springer}, annote = {Examines a number of heuristics for determining which calls to inline and then presents the author's own; unfortunately the empirical evaluation does not compare the approaches, only different decision functions and their parameters for the new framework. The \emph{how} part discusses how to treat recursive functions. One interesting effect is that the worst code growth of any combination of decision function and parameters is only a factor of 1.08, and there are even code shrinks down to a factor of 0.82 (for Scheme benchmarks).} } @InProceedings{debosschere+94, author = {Koen De Bosschere and Saumya Debray and David Gudeman and Sampath Kannan}, title = {Call Forwarding: A Simple Interprocedural Optimization Technique for Dynamically Typed Languages}, crossref = {popl94}, pages = {409--420}, annote = {Attacks the problem of type checks at the entry of procedures in dynamically-typed languages, or, in general, any kind of entry action (e.g., unboxing). For some call sites some of these entry actions are unnecessary (e.g., for typechecks, because the type is known). The paper tries to exploit this for optimization by having several entry points for the procedure, and every call site calling an entry point that precedes all necessary (and possibly some unnecessary) entry actions. The entry actions can be ordered to minimize executing unnecessary entry actions; this problem is NP-complete. Another approach to dealing with the problem is to inline some entry actions into the call sites; the paper claims that this approach leads to significant code bloat in a number of application programs. The paper introduces a greedy algorithm for ordering the entry actions and performing bounded inlining of entry actions; this algorithm performs optimally for all presented benchmarks, but the paper does not say what bound on code growth was used (except maybe in footnote 3; it appears to imply that the implementation of the algorithm copies just enough to avoid all unnecessary entry actions; then the optimality is trivial, but what about code growth (no data presented for that)?).
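The multiple-entry-point idea can be rendered in C roughly as follows (my sketch; the paper works at the compiler level, and all names here are invented):
\begin{verbatim}
/* My C rendering of multiple entry points; names invented. */
#include <stdlib.h>

typedef struct { int tag; long val; } Obj;   /* tagged value */

static void body(long x) { (void)x; /* the real work */ }

void entry_unboxed(long x)     /* entry point 2: after all checks */
{
    body(x);
}

void entry_checked(Obj *o)     /* entry point 1: all entry actions */
{
    if (o->tag != 0) abort();  /* entry action: type check */
    entry_unboxed(o->val);     /* entry action: unbox, then body */
}
/* A call site that already knows its argument is an integer calls
   entry_unboxed directly, skipping the unnecessary type check. */
\end{verbatim}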
The improvements in execution time are 12\%--45\%.} } @Proceedings{popl94, booktitle = "Principles of Programming Languages (POPL '94)", title = "Principles of Programming Languages (POPL '94)", year = "1994", key = "POPL '94" } @InProceedings{goubault94, author = {Jean Goubault}, title = {Generalized Boxings, Congruences and Partial Inlining}, booktitle = {Static Analysis Symposium (SAS '94)}, pages = {147--161}, year = {1994}, volume = {864}, series = {LNCS}, publisher = {Springer}, annote = {Discusses several ways to optimize boxing and unboxing. One of the approaches is partial inlining: inlining the unboxing at the start and the boxing at the end into the caller, and then optimizing it away; this can be extended to also inline tests like ML's pattern matching code. The paper reports no practical experience with partial inlining.} } @Article{kaser&ramakrishnan98, author = {Owen Kaser and C. R. Ramakrishnan}, title = {Evaluating Inlining Techniques}, journal = complang, year = {1998}, volume = {24}, OPTnumber = {}, pages = {55--72}, annote = {Inlining the original versions of procedures is more powerful than inlining the current version (which may have some inlinings already applied), but the difference only plays a role for recursive procedures. The paper also discusses an inlining heuristic based on call frequencies; with this heuristic (and probably others), original-version inlining tends to have problems with local maxima, so the paper proposes a hybrid strategy. The paper presents an empirical comparison of the techniques it presents and some others; the hybrid strategy removes 36\%--97\% of all inlinable calls with 5\% code expansion and 62\%--100\% with 20\% code growth.} } @Book{wulf+75, author = {William Wulf and Richard K. Johnsson and Charles B. Weinstock and Steven O. Hobbs and Charles M. Geschke}, title = {The Design of an Optimizing Compiler}, publisher = {Elsevier}, year = {1975}, isbn = {0-444-0164-6}, annote = {Describes a complete Bliss/11 compiler for the PDP-11. It uses some interesting techniques: it uses a (hand-constructed) tree parsing automaton for parts of the code selection (Section~3.4); it optimizes the use of unary complement operators (Section~3.3); it uses a smart scheme to represent a conservative approximation of the lifetime of variables in constant space and uses that for register allocation (Sections~4.1.3 and~4.3).} } @Article{rosenblum&ousterhout92, author = {Mendel Rosenblum and John K. Ousterhout}, title = {The Design and Implementation of a Log-Structured File System}, journal = tocs, year = {1992}, volume = {10}, number = {1}, pages = {26--52}, annote = {Gives a rather high-level description of the Sprite log-structured file system; there are too many details missing for my taste. But the discussion of cleaning and, in particular, cleaning policies is quite detailed; the paper compares a greedy heuristic that cleans the segments with the least utilization with a cost/benefit heuristic that tends to clean segments with old (and rarely-changing) data at higher utilization than young segments; the cost/benefit heuristic works significantly better. The intuition is that old data is unlikely to die soon, so we might just as well reclaim the free space associated with it now and use it, instead of waiting for a long time until utilization falls below the threshold of the greedy heuristic. The file system maintains a segment usage table to have data for this heuristic.
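If I remember the paper correctly, the cost/benefit heuristic cleans the segment that maximizes $\frac{\mathit{benefit}}{\mathit{cost}} = \frac{(1-u)\cdot\mathit{age}}{1+u}$, where $u$ is the segment's utilization (the cost $1+u$ accounts for reading the whole segment and writing back the live fraction $u$) and the age is the time since the most recent modification of any block in the segment.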
The paper also presents empirical data showing that the Sprite LFS is much faster than SunOS FFS on small files, and that the cleaning behaviour on real workloads is even better than in the simulations they presented earlier. One interesting statement was that the Sprite LFS is not more complex than FFS: it needs log handling and the cleaner, but can skip complex block allocation algorithms and fsck.} } @InProceedings{tweedie98, author = {Stephen Tweedie}, title = {Journaling the {Linux} ext2fs Filesystem}, booktitle = {LinuxExpo '98}, year = {1998}, url = {ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz}, annote = {A nice description of adding journaling to a conventional file system.} } @InProceedings{mckusick&ganger99, author = {Marshall Kirk McKusick and Gregory R. Ganger}, title = {Soft Updates: A Technique for Eliminating Most Synchronous Writes in the Fast Filesystem}, crossref = {freenix99}, pages = {1--17}, year = {1999}, annote = {Soft updates enhance the BSD FFS for safe asynchronous writing and fast crash recovery. The data and meta-data are written in such an order that the only possible inconsistency on disk is allocated but unused blocks or inodes. Thus the file system can be mounted without being checked. Soft updates also increase the speed of updating the file system by requiring fewer writes and hardly any synchronous writes. The paper gives a pretty detailed account of the changes in FFS, discusses the experiences, and provides some empirical results (showing that soft updates can be quite a bit faster than FFS's synchronous metadata updates and close to plain asynchronous updates). The paper also explains how to do snapshots on a conventional file system: A snapshot is a sparse file with the size of the file system; when a block in the file system is written, a copy of the old contents is written into the snapshot. Soft updates use snapshots for running fsck (to reclaim lost space) and backups while having the file system mounted and writable.} } @Proceedings{freenix99, title = {FREENIX Track, USENIX Annual Technical Conference}, booktitle = {FREENIX Track, USENIX Annual Technical Conference}, year = {1999}, OPTkey = {FREENIX '99} } @InProceedings{blackwell+95, author = {Trevor Blackwell and Jeffrey Harris and Margo Seltzer}, title = {Heuristic Cleaning Algorithms in Log-Structured File Systems}, crossref = {usenix95}, pages = {277--288}, url = {http://www.eecs.harvard.edu/~tlb/usenixw95.ps}, annote = {The authors gathered traces of file accesses by snooping the network traffic of three dedicated NFS servers in two environments (university and business). Then they used these traces on an LFS simulator to measure the effects of cleaning and evaluate some cleaning parameters (but not heuristics involving, e.g., age, utilization, etc.); the file systems were 90\% full. Results: After two seconds idle time, there is a high probability (86\%--98\%) of another two seconds idle time; cleaner interference is small: an average of 0.02--0.07 cleaner requests are in the queue when a user request arrives. Most cleaning is done in the background, on-demand cleaning is rare (0\%--3.3\% of the segments cleaned). The amount of data written between idle gaps for cleaning was usually less than 16MB and never more than 350MB. 70\%--90\% of the requests could be satisfied immediately, and nearly all within 100ms.} } @InProceedings{seltzer+95, author = {Margo Seltzer and Keith A.
Smith and Hari Balakrishnan and Jacqueline Chang and Sara McMains and Venkata Padmanabhan}, title = {File System Logging Versus Clustering: A Performance Comparison}, crossref = {usenix95}, OPTpages = {}, url = {http://www.eecs.harvard.edu/~margo/usenix.195/usenix.195.ps.gz}, critique-url = {http://www.scriptics.com/people/john.ousterhout/seltzer.html}, sources-url = {http://www.eecs.harvard.edu/~margo/usenix.195/}, annote = {This paper compares the BSD LFS with an improved version of BSD FFS using various benchmarks.} } @InProceedings{vahalia+95, author = {Uresh Vahalia and Cary G. Gray and Dennis Ting}, title = {Metadata Logging in an NFS Server}, crossref = {usenix95}, pages = {265--276}, annote = {Describes the addition of meta-data-only journaling to BSD FFS in order to get good latency for NFS writes (hmm, doesn't NFS also require data writes to be persistent before reporting completion?). Contains some interesting discussions of various aspects (e.g., replaying logical updates is a problem, so physical logging is preferable).} } @Proceedings{usenix95, booktitle = {Usenix Annual Technical Conference}, title = {Usenix Annual Technical Conference}, year = {1995}, key = {Usenix '95} } @Article{johnson&laing96, author = {James E. Johnson and William A. Laing}, title = {Overview of the {Spiralog} File System}, journal = {Digital Technical Journal}, year = {1996}, volume = {8}, number = {2}, pages = {5--14}, annote = {The Spiralog file system is based on a log-structured local file system, remote access provided by a client (clerk)/server combination, a low-level file system interface (VPI) on which two file system personalities are built, and a backup system. This paper gives an overview, and there are companion papers \cite{whitaker+96,green+96} describing the components in more detail.} } @Article{whitaker+96, author = {Christopher Whitaker and J. Stuart Bayley and Rod D. W. Widdowson}, title = {Design of the Server for the {Spiralog} File System}, journal = {Digital Technical Journal}, year = {1996}, volume = {8}, number = {2}, pages = {15--31}, url = {http://www.digital.com/info/DTJM02/DTJM02P8.PS}, annote = {Discusses the on-disk structure and related stuff of the Spiralog file system and includes some hindsight comments. Spiralog is a log-structured file system that uses 256KB segments for allocation and cleaning; in contrast to block-oriented FSs like the BSD FFS, Spiralog's basic objects are byte streams and named cells, organized in B-trees. The file system is organized into a log driver layer that provides an infinite log address space (contrast to \cite{dejonge+93}, where a fixed-size update-in-place address space is provided), a mapping layer that maps the objects onto this log address space, and the cleaner. Each segment contains a data area and a 24KB commit record area, making it easy to distinguish commits from user data but incurring a space and time penalty. Cleaning is performed on-demand (when fewer than 300 segments are available); the cleaner's goal is to write full segments, emptying other segments is a side effect. Spiralog's backup utilities work at the log driver layer, with interesting consequences: segments that are part of a snapshot cannot be cleaned; incremental backups just back up a range in the log address space; no support for user-visible snapshots is needed or provided.} } @Article{green+96, author = {Russel J. Green and Alasdair C. Baird and J.
Christopher Davies}, title = {Designing a Fast, On-line Backup System for a Log-structured File System}, journal = {Digital Technical Journal}, year = {1996}, volume = {8}, number = {2}, pages = {32--45}, annote = {The Spiralog backup system is primarily physical, but uses log addresses instead of physical addresses (avoiding a dependence on partition sizes); it copies the live segments to tape completely (i.e., including dead blocks); incremental backups are performed by backing up a range of segments (which may contain only parts of changed files, but also old stuff copied by the cleaner). This avoids the need to have logical snapshots, simplifying the cleaner (with physical snapshots it just has to refrain from cleaning the segments in the snapshot).} } @InProceedings{matthews+97, author = {Jeanna Neefe Matthews and Drew Roselli and Adam M. Costello and Randolph Y. Wang and Thomas E. Anderson}, title = {Improving the Performance of Log-Structured File Systems with Adaptive Methods}, booktitle = {Sixteenth ACM Symposium on Operating System Principles (SOSP '97)}, OPTpages = {}, year = {1997}, url = {http://www.cs.berkeley.edu/~neefe/papers/sosp97/sosp97.ps}, annote = {Examines various optimizations and tuning issues for an LFS, and approaches to tune the file system automatically to the workload (self-tuning). The results are based on simulations of an LFS and a disk; the benchmarks used are the Berkeley Auspex trace, and a synthetic random update benchmark. Write cost for the Auspex trace and 85\% disk utilization is minimal at a segment size of about 4$\times$access~time$\times$bandwidth; for the random workload, write cost is minimized at single-block segments. Hole-plugging (with metadata in a block header) is never better than cleaning for the Auspex workload, but better than cleaning at $>85\%$ utilization for the random workload (the crossover comes later on disks with more bandwidth$\times$access time). Even at 99\% utilization, the write cost of the Auspex workload is only 3. An adaptive version that switches between hole-plugging and segment cleaning performed about as well as the better of hole-plugging and segment cleaning at the respective utilization for the random workload; for the Auspex workload it performed slightly better than either beyond 98\% utilization. Another optimization the authors propose is preferring segments that are still cached (from writing) when cleaning, saving read cost; they report some nice speedups for large cache sizes and high utilization (write cost reduction from 2.4 to 1.9 with 1GB cache and 95\% utilization); however, the evaluation seems to compare against a crippled version of the original heuristic that does not profit from the cache at all (i.e., no lucky hits in the cache), so I think it does not say much about the value of the new heuristic, but mainly about the value of caching. The paper also proposes optimizing read performance by recording frequent sequences of block reads and copying them together, with promising results; the average per-block read response times seem large, however (6.5ms--12ms); are the files in the benchmark so small? The paper gives many references.} } @InProceedings{chutani+92, author = {Sailesh Chutani and Owen T. Anderson and Michael L. Kazar and Bruce W. Leverett and W. Anthony Mason and Robert N. Sidebotham}, title = {The {Episode} File System}, booktitle = {Usenix Conference}, pages = {43--60}, year = {1992}, month = {Winter}, annote = {Episode is the local file system of the AFS 4 (and DCE DFS) distributed file system.
Episode is a journaling file system that logs only metadata updates; it logs both the new and the old value of the data being modified, reducing the constraints on the order of writes (since changes can be rolled back); the paper discusses the issues of using and implementing transactions internal to the file system in some detail. Episode supports multiple filesets per volume. It also supports read-only clones of filesets, using block-level copy-on-write in the implementation; when cloning a fileset, all inodes are copied and the COW bit is set in every block pointer; when the file system tries to write to a block that has the COW bit set, a new block is allocated and the data is written there (and the pointer to it has a cleared COW bit). The paper gives empirical data comparing Episode to both BSD FFS and JFS, where Episode does quite well (except for CPU time, which the paper attributes to missing tuning). The paper also gives results on recovery times, showing that the recovery time varied widely, but correlated with the number of active processes at the time of the crash.} } @InProceedings{roome92, author = {W. D. Roome}, title = {{3DFS}: A Time-Oriented File Server}, booktitle = {Usenix Conference}, pages = {405--418}, year = {1992}, month = {Winter}, OPTannote = {Describes the interface and the implementation of an NFS server that allows access to old versions of the files (the changed files are typically recorded at the granularity of a day). The server uses a WORM jukebox together with a magnetic indexing and cache disk; it can recover from failure of the magnetic disk.} } @InProceedings{hitz+94, author = {Dave Hitz and James Lau and Michael Malcolm}, title = {File System Design for an {NFS} File Server Appliance}, booktitle = {Usenix Conference}, OPTpages = {}, year = {1994}, month = {Winter}, url = {http://www.netapp.com/library/tr/3002.pdf}, annote = {If you want to learn about log-structured file systems, start by reading this paper! It presents the on-disk structure of a no-update-in-place file system (WAFL) in a clear way. The difference from the usual log-structured file systems is that WAFL manages the free space with a block map instead of using segments and a cleaner. Another difference is that WAFL can deal with several snapshots (the block map contains 32 bits per block, with one bit for every snapshot). The file system is for a dedicated NFS server. This server contains NVRAM, allowing quick acknowledgement of writes and writing to the disk in large batches (the on-disk structure would result in a large proportion of meta-data writes if data had to be written in small batches). The NVRAM stores the write requests as they come in, not the resulting disk blocks; this maximizes the number of requests that can be stored in the NVRAM; on recovery the requests are simply replayed.} } @InProceedings{ruemmler&wilkes93, author = {Chris Ruemmler and John Wilkes}, title = {{UNIX} Disk Access Patterns}, booktitle = {Usenix Conference}, pages = {405--420}, year = {1993}, month = {Winter}, url = {http://www.hpl.hp.com/personal/John_Wilkes/papers/USENIX.Jan93.ps.Z}, annote = {Presents performance data at the disk driver level from traces on several machines under HP-UX. Most accesses were through a BSD FFS file system, but they also recorded swapping and paging.
One interesting result was that increasing the cache hardly reduced read accesses.} } @InProceedings{seltzer+93, author = {Margo Seltzer and Keith Bostic and Marshall Kirk McKusick and Carl Staelin}, title = {An Implementation of a Log-Structured File System for {UNIX}}, booktitle = {Usenix Conference}, pages = {307--326}, year = {1993}, month = {Winter}, critique-url = {http://www.scriptics.com/people/john.ousterhout/seltzer93.html}, annote = {This paper describes the BSD LFS; it gives more details about the on-disk structure of an LFS than \cite{rosenblum&ousterhout92} and should therefore be read first (or better, read \cite{hitz+94} first). The paper also presents some differences from and improvements over Sprite LFS: less memory consumption, dealing with almost-full disks, user-level cleaning, fewer on-disk structures outside files, dealing with directory operations by having non-committing segment summaries instead of introducing special log entries; one disadvantage over Sprite LFS is that BSD LFS does not use the information in segment summaries to roll forward, and therefore needs to write out all dirty metadata on every checkpoint. The paper also presents some performance measurements, comparing BSD LFS with BSD FFS and an improved version of FFS.} } @Article{ousterhout&douglis89, author = {John Ousterhout and Fred Douglis}, title = {Beating the {I/O} Bottleneck: A Case for Log-Structured File Systems}, journal = {Operating Systems Review}, year = {1989}, volume = {23}, number = {1}, pages = {11--28}, month = jan, annote = {Predicts that I/O (in particular, disk seeks during file access) would become a bottleneck, because CPU performance is improving faster than disk seek times; they were certainly right about the speedups, but the file access bottleneck has not happened in my environment yet (1999); we still have systems with 0.08 disks per concurrent user on a 1200~MIPS machine doing software development, whereas the paper predicts requiring 20--80 disks per user on a 500~MIPS machine. The paper discusses a number of solutions for these problems: large file caches for eliminating most reads (this happens on our system), write-back caches with battery backup or, alternatively, cache logging, and log-structured file systems. The paper presents log-structured file systems in some depth.} } @InProceedings{douglis&ousterhout89, author = {Fred Douglis and John Ousterhout}, title = {Log-Structured File Systems}, booktitle = {{IEEE COMPCON}}, pages = {124--129}, year = {1989}, annote = {A shorter version of \cite{ousterhout&douglis89}. I recommend reading the longer version.} } @TechReport{dejonge+93, author = {de Jonge, Wiebren and M. Frans Kaashoek and Wilson C. Hsieh}, title = {Logical Disk: A Simple New Approach to Improving File System Performance}, institution = {MIT}, number = {LCS/TR-566}, institution2 = {Vrije Universiteit Amsterdam}, number2 = {IR-325}, year = {1993}, note = {A paper on the same topic appeared at SOSP '93}, url = {ftp://ftp-pubs.lcs.mit.edu/pub/lcs-pubs/tr.outbox/MIT-LCS-TR-566.ps.gz}, annote = {The logical disk provides an interface slightly above the device driver interface that provides the performance advantages of log-structured file systems with few changes to the file system. Disadvantages: high RAM consumption for the logical-to-physical map, long recovery times (every segment summary has to be read to recover the map, then fsck has to be performed), none of the functionality advantages of an LFS.
The paper reports performance for an adapted version of the Minix FS.} } @InProceedings{lee+99, author = {Yui-Wah Lee and Kwong-Sak Leung and Mahadev Satyanarayanan}, title = {Operation-based Update Propagation in a Mobile File System}, crossref = {usenix99}, pages = {43--56}, url = {http://www.cs.cmu.edu/afs/cs/project/coda/Web/docdir/hcimd98.pdf}, abstract-url = {http://www.cs.cmu.edu/afs/cs/project/coda/Web/absdir/s15-abstract.html}, annote = {The amount of data transferred for keeping a mobile and a server file system in sync is reduced by transferring operations that change files (e.g., invocations of the compiler), running them on the remote system, comparing the results (with checksums), and only transferring the full file if the results differ.} } @InProceedings{zadok+99, author = {Erez Zadok and Ion Badulescu and Alex Shender}, title = {Extending File Systems Using Stackable Templates}, crossref = {usenix99}, pages = {57--70}, html-url = {http://www.cs.columbia.edu/~ezk/research/wrapfs/}, ps-url = {http://www.cs.columbia.edu/~ezk/research/wrapfs/wrapfs.ps}, annote = {Describes the design and implementation of Wrapfs, which provides a relatively simple interface for adding functionality (e.g., encryption) to file systems.} } @InProceedings{shriver+99, author = {Elizabeth Shriver and Christopher Small and Keith A. Smith}, title = {Why Does File System Prefetching Work?}, crossref = {usenix99}, pages = {71--84}, url = {http://www.bell-labs.com/user/shriver/postscript/prefetching-usenix99.ps} } @InProceedings{brecht&sandhu99, author = {Tim Brecht and Harjinder Sandhu}, title = {The Region Trap Library: Handling Traps on Application-Defined Regions of Memory}, crossref = {usenix99}, pages = {85--99}, url = {http://bbcr.uwaterloo.ca/~brecht/papers/postscript/usenix99.ps}, html-url = {http://bbcr.uwaterloo.ca/~brecht/papers/html/usenix99/paper.html}, annote = {This library allows the user to get MMU traps for arbitrarily sized regions (not just page-aligned regions) and to change the protection level of regions; it works by mapping (the current version uses copying, but mapping should be a simple optimization) all the regions several times into memory with different protection levels, and swizzling pointers to a specific region with a specific protection to point to the copy with the appropriate protection. This requires that pointers to such regions be declared to the library, but the resulting interface looks decent enough.} } @InProceedings{cranor&parulkar99, author = {Charles D. Cranor and Gurudatta M. Parulkar}, title = {The {UVM} Virtual Memory System}, crossref = {usenix99}, pages = {117--130}, annote = {Presents the UVM system for NetBSD and an empirical comparison with the old BSD VM system.} } @InProceedings{miller&myers99, author = {Robert C. Miller and Brad C. Myers}, title = {Lightweight Structured Text Processing}, crossref = {usenix99}, pages = {131--144}, url = {http://www.cs.cmu.edu/~rcm/papers/usenix99/usenix99.html}, annote = {Among other things, this paper contains some algebra-type stuff about dealing with text regions and text constraints.} } @InProceedings{banga+99, author = {Gaurav Banga and Jeffrey C.
Mogul and Peter Druschel}, title = {A Scalable and Explicit Event Delivery Mechanism for {UNIX}}, crossref = {usenix99}, pages = {253--265}, url = {http://www.cs.rice.edu/~druschel/usenix99event.ps.gz}, annote = {Discusses the scaling problems of the select() and poll() interfaces for learning about events, proposes a new, more scalable, but also more complicated interface, and evaluates (an implementation of) this interface.} } @InProceedings{deller&heiser99, author = {Luke Deller and Gernot Heiser}, title = {Linking Programs in a Single Address Space}, crossref = {usenix99}, pages = {283--294}, html-url = {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/index.html}, url = {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/paper.ps.gz}, abstract-url = {http://www.cse.unsw.edu.au/~disy/papers/Deller_Heiser_99/abstract}, annote = {Discusses the issues involved in static and dynamic linking in a normal and a single-address-space OS, and gives some performance data. Recommended reading if you want to learn about linking.} } @InProceedings{nightingale+99, author = {Tycho Nightingale and Yiming Hu and Qing Yang}, title = {The Design and Implementation of a {DCD} Device Driver for {Unix}}, crossref = {usenix99}, pages = {295--307}, url = {ftp://ftp.ele.uri.edu/pub/tycho/USENIX99.ps.gz}, annote = {DCD (Disk Caching Disk) adds journaling at the device driver level (similar to the logical disk \cite{dejonge+93}, which adds log structure at the device driver level). The advantage of this approach is that the file system does not need to be changed, the disadvantage is that we do not get some of the advantages of journaling (e.g., fast crash recovery); the advantage that the paper emphasizes is performance (compared to BSD FFS).} } @InProceedings{anderson&griffioen99, author = {Todd A. Anderson and James Griffioen}, title = {An Application-Aware Data Storage Model}, crossref = {usenix99}, pages = {309--322}, annote = {Proposes that the application should specify how much persistence is wanted for a file; the OS (or, in the paper, a distributed file system) can then ensure that the valuable files are transferred soon to several servers while the easily regenerated files are typically just held locally on the clients (possibly just in RAM), with possible automated help in reconstructing the file.} } @Proceedings{usenix99, title = {Usenix Annual Technical Conference}, booktitle = {Usenix Annual Technical Conference}, year = {1999}, key = {Usenix '99} } @Book{corporaal98, author = {Henk Corporaal}, title = {Microprocessor Architectures -- from VLIW to TTA}, publisher = {John Wiley \& Sons}, year = {1998}, annote = {Transport-triggered architectures make the buses used to transfer data between functional units and register files architecturally visible and programmable. This book gives a comprehensive discussion of TTAs, motivating the approach, discussing various design issues and alternatives, giving various theoretical and empirical results, and other stuff (e.g., automatic generation of application-specific processors by optimization from a template). Compilation techniques are not covered in depth, so you will have to look up the original papers. Recommended.} } @Book{higham98, author = {Nicholas J.
Higham}, title = {Handbook of Writing for the Mathematical Sciences}, publisher = {Society for Industrial and Applied Mathematics (SIAM)}, year = {1998}, address = {Philadelphia}, annote = {A nice book teaching many aspects of scientific writing, from basic style issues to writing, publishing, and presenting a paper. There is some discussion specific to mathematical writing, but most of the book is more general in scope; many of the examples have a maths or CS background, though.} } @PhdThesis{brandis95thesis, author = "Marc M. Brandis", title = "Optimizing Compilers for Structured Programming Languages", school = "Institute for Computer Systems, ETH Zurich", year = "1995", type = "{PhD} Dissertation", url = "ftp://ftp.inf.ethz.ch/pub/publications/dissertations/th11024.ps.gz", number = "11024", abstract = "Modern processor architectures rely on optimizing compilers to achieve high performance. Such architectures expose details of their hardware to the compiler, which has to deal with them in generating machine code. This development has led to complex and slow compilers, which are difficult to understand, implement, and maintain. This thesis reports on methods to simultaneously reduce the complexity and the compile-time of optimizing compilers by more than a decimal order of magnitude. It presents a novel intermediate program representation, which integrates data- and control-flow into a single data-structure. This provides not just for simpler and faster optimization algorithms, but also for more powerful optimization techniques. The thesis also describes single-pass algorithms to construct this intermediate program representation from structured source code, as well as single-pass techniques to transform programs with restricted kinds of unstructured control-flow like in Oberon into structured form. The integration of these techniques with the parser allows to implement fast and compact front-ends for structured programming languages, that avoid the many auxiliary data structures other optimizing compilers require. A description of several optimization algorithms and how they can be implemented on this intermediate program representation shows the feasibility of the approach. Most of these techniques have been implemented in a prototypical optimizing compiler translating a subset of the programming language Oberon for the PowerPC architecture. Measurements on this compiler prove that both the complexity and the compile-time of optimizing compilers can be reduced by an order of magnitude when translating a structured programming language and when using this novel intermediate representation and the associated algorithms. The thesis concludes with some feedback to the programming language designers, which language constructs cause undue complications in optimizing compilers and should therefore be omitted.", annote = "Describes an optimizing Oberon compiler using SSA as intermediate representation. It can even be used as a case-study-style textbook on advanced compiler construction (at least it impressed me more in this respect than other compiler textbooks I have read recently). One other important aspect in this regard is that the compiler was actually implemented, mostly by one person. Highly recommended." } @Book{appel98, author = {Andrew W. Appel}, title = {Modern Compiler Implementation in C}, publisher = {Cambridge University Press}, year = {1998}, annote = {The C version of the Tiger book.
The first half presents the design for a complete compiler for Tiger (a Pascal-like language) and, of course, the techniques used in that design (and some surrounding material); the compiler itself is an exercise. Then the book discusses handling various language variations (e.g., OO and functional programming), and various optimization techniques.} } @InProceedings{chung+00, author = {Yoo C. Chung and Soo-Mook Moon and Kemal Ebcio\u{g}lu and Dan Sahlin}, title = {Reducing Sweep Time for a Nearly Empty Heap}, booktitle = {Symposium on Principles of Programming Languages (POPL'00)}, pages = {378--389}, year = {2000}, annote = {Proposes \emph{selective sweeping}, where the marked objects are sorted by address and then all the gaps are put into the freelist. This makes the sweep time asymptotically proportional to the number of live objects. For some benchmarks, this method is slower for smaller heap sizes than the other sweeping method they are using; they propose \emph{adaptive sweeping}, which switches between the methods depending on the fraction of live memory. Comments: The sorting done explicitly in selective sweeping is implicit in mark-and-sweep collectors using external bitmaps; however, there clearing the bitmap before each collection takes time proportional to heap size; a more sophisticated data structure than a bitmap could eliminate this. More fundamentally, using a heap that is much larger than the live memory is bad behaviour in a multi-tasking system, so a garbage collector should try to keep the heap size proportional to live memory anyway, and therefore the live-size vs. heap-size complexity argument is irrelevant. And even if you allow huge heap sizes, the argument becomes irrelevant as soon as you consider the combination of allocation and GC cost (you cannot GC memory that you have not allocated).} } @InProceedings{bala+00, author = {Vasanth Bala and Evelyn Duesterwald and Sanjeev Banerjia}, title = {Dynamo: A Transparent Dynamic Optimization System}, crossref = {sigplan00}, pages = {1--12}, annote = {Dynamo starts by emulating binaries while profiling back-edges. When it decides that a loop header instruction is hot, it generates a fragment (a superblock that can cross call/return boundaries) based on the specific trace executed that time, stores that in the fragment cache, and uses it in the future (until the fragment cache is flushed). It also performs a few optimizations; however, the results indicate that the main performance benefit (on the PA-8000) comes from fragment formation alone, i.e., from the partial inlining and the code layout. The paper presents data taken mostly from the SPEC benchmarks and Dynamo achieves speedups of -2\%--22\% over HP's compiler with -O2. The discussion also indicates that Dynamo loses on programs with run-times $<1$min and on programs without stable working sets. The first problem could be worked around by using such a technique only on binaries that have run for a while.} } @InProceedings{wilken+00, author = {Kent Wilken and Jack Liu and Mark Heffernan}, title = {Optimal Instruction Scheduling Using Integer Programming}, crossref = {sigplan00}, pages = {121--133}, annote = {This paper improves the integer programming methods used for optimal instruction scheduling in several ways, finally getting all the basic blocks from SPECfp95 for a single-issue machine with latencies of 1--3 cycles (with up to about 1000 instructions) to schedule in decent time.
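For orientation, here is the standard 0--1 integer programming formulation of basic-block scheduling (my summary; the paper's formulation may differ in details): with $x_{it}=1$ iff instruction $i$ is issued in cycle $t$, require $\sum_t x_{it}=1$ for every $i$, $\sum_i x_{it}\le 1$ for every $t$ (single issue), and $\sum_t t\,x_{jt} \ge \sum_t t\,x_{it} + l_{ij}$ for every dependence edge $i\rightarrow j$ with latency $l_{ij}$; the objective is to minimize the issue cycle of the last instruction.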
The improvements are: DAG transformations like partitioning, eliminating redundant edges, linearizing some regions (the presented method works only for single-issue machines, though); some consistency conditions that should also work for CLP (and maybe dynamic programming) formulations of the problem; and some techniques specific to integer programming (but maybe only attacking the problems inherent in that technique). With all these improvements, they scheduled a block with 1000 instructions, while earlier optimal techniques were limited to about 30 instructions. It remains to be seen, however, how well it does on more complex machine models. It would also be interesting to see how CLP and dynamic programming would benefit from the improvements presented in this paper (at least those that are applicable).} } @InProceedings{yi+00, author = {Qing Yi and Vikram Adve and Ken Kennedy}, title = {Transforming Loops to Recursion for Multi-Level Memory Hierarchies}, crossref = {sigplan00}, pages = {169--181}, annote = {Recursive versions of various array processing algorithms (e.g., dividing matrix multiplication into eight multiplies and four additions of quarter-sized matrices) have nice memory performance characteristics, similar to cache blocking at all cache levels, but they need not be tuned for cache sizes. This paper presents an algorithm that automatically transforms a loop-based program into a recursion-based program, and some impressive performance data for the results.} } @InProceedings{cannarozzi+00, author = {Dante J. Cannarozzi and Michael P. Plezbert and Ron K. Cytron}, title = {Contaminated Garbage Collection}, crossref = {sigplan00}, pages = {264--273}, annote = {Presents a memory reclamation technique that associates a stack frame with each object; it guarantees that the object is dead when the stack frame dies. The frame references in the objects may have to be updated when references change. The method is quite inaccurate (it may keep many dead objects around), and therefore has to be supplemented by a more accurate method (a more conventional garbage collector); the supposed benefit is that the conventional GC has to be invoked fewer times, which may result in an overall speedup if the conventional GC is slower in reclaiming memory than the new approach. The paper presents some data involving JDK 1.1.8 that support this claim, but I am not convinced that this can be generalized (how good is the JDK 1.1.8 GC?).} } @InProceedings{boothe00, author = {Bob Boothe}, title = {Efficient Algorithms for Bidirectional Debugging}, crossref = {sigplan00}, pages = {299--310}, annote = {Describes a debugger that can step backwards in the program. A program compiled for this debugger contains a call to a counter routine for every statement. The slowdown from this is about a factor of 2. To step back one statement, the program is simply reexecuted until the counter is one less than the current value. The paper also describes stepping back by breakpoints, stepping back over calls, stepping back to the start of the current procedure, and backwards watchpoints. To speed up reexecution, the debugger creates checkpoints (with fork).
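To make the counting mechanism concrete, here is my sketch of the inserted instrumentation (count_stmt and debugger_stop are invented names; the paper's actual instrumentation may differ):
\begin{verbatim}
/* My sketch of the counting idea (names invented): the compiler
   inserts a call to count_stmt() before every statement. */
void debugger_stop(void);       /* assumed hook into the debugger */

static unsigned long counter;   /* statements executed so far */
static unsigned long target;    /* set before reexecution */

void count_stmt(void)
{
    if (++counter == target)    /* reached the desired earlier point */
        debugger_stop();
}
/* To step back one statement, set target to the current counter
   minus one, restore the nearest checkpoint, and reexecute. */
\end{verbatim}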
One complication in reexecution is (I/O) system calls; the debugger does not reexecute them, but returns the same results that they returned upon first execution.} } @Proceedings{sigplan00, booktitle = "SIGPLAN '00 Conference on Programming Language Design and Implementation", title = "SIGPLAN '00 Conference on Programming Language Design and Implementation", year = "2000", key = "PLDI '00" } @InProceedings{oskin+00, author = {Mark Oskin and Frederic T. Chong and Matthew Farrens}, title = {HLS: Combining Statistical and Symbolic Simulation to Guide Microprocessor Designs}, crossref = {isca00}, pages = {71--82}, annote = {They characterize a workload by the usual instruction class usage frequencies (Dhrystone-like), cache and branch prediction hit rates (for a specific configuration), and (new) by the dynamic instruction distances (DID), i.e., the number of instructions that a dependence spans. The statistical simulator then generates a random instruction stream with the same characteristics, and executes this on a cycle-accurate simulator, which converges after only a few thousand simulated cycles. For the SPECint95 benchmarks on the SimpleScalar architecture they get IPC values within 7\% of the original SimpleScalar values; lower branch prediction and cache accuracies also lower the accuracy of the statistical simulator. The DID characteristic seems to perform surprisingly well. The paper then goes on to demonstrate how the technique could be used for exploring design tradeoffs, but without validation from a full simulator I don't trust the results. The paper also discusses the limits of their technique.} } @Proceedings{isca00, title = "$27^\textit{th}$ Annual International Symposium on Computer Architecture", booktitle = "$27^\textit{th}$ Annual International Symposium on Computer Architecture", year = "2000", key = "ISCA 27", } @Article{bailey91, author = {David H. Bailey}, title = {Twelve Ways to Fool the Masses When Giving Performance Results on Parallel Computers}, journal = {Supercomputing Review}, year = {1991}, pages = {54--55}, month = aug, url = {http://www.pdc.kth.se/training/twelve-ways.html}, annote = {Most of these ways are somewhat specific to supercomputing, but you can probably learn lessons applicable to other fields.} } @Article{zelkowitz&wallace98, author = {Marvin V. Zelkowitz and Dolores R. Wallace}, title = {Experimental Models for Validating Technology}, journal = ieeecomputer, year = {1998}, volume = {31}, number = {5}, pages = {23--31}, month = may, annote = {Presents a classification of software engineering validation models and discusses them. They also classified 612 papers in software engineering; the most popular methods are no experimentation (167 papers) and Assertion (ad-hoc validation techniques for the proposed technology, with danger of bias; 192 papers).} } @Article{tichy98, author = {Walter F. Tichy}, title = {Should Computer Scientists Experiment More?}, journal = ieeecomputer, year = {1998}, volume = {31}, number = {5}, pages = {32--40}, month = may, url = {http://wwwipd.ira.uka.de/~tichy/}, annote = {Discusses the role experimentation should play in computer science, and why some of the excuses given for not doing it are invalid.} } @Article{tichy+95, author = {Walter F. Tichy and Paul Lukowicz and Lutz Prechelt and Ernst A.
Heinz}, title = {Experimental Evaluation in Computer Science: A Quantitative Study}, journal = {Journal of Systems and Software}, year = {1995}, volume = {28}, number = {1}, pages = {9--18}, month = jan, url = {http://wwwipd.ira.uka.de/~prechelt/Biblio/1994-17.ps.gz}, annote = {Classifies the papers in several journals and conferences (non-CS: Optical Engineering, Neural Computation; CS: TOCS, PLDI, TOPLAS, TSE, and a random sample of ACM papers) into Theory, Design, Empirical, Hypothesis, and Other papers; for the Design papers it classifies the amount of space dedicated to the empirical validation. They observe that the CS papers have a higher percentage of design articles without empirical validation than the non-CS papers (35\%--55\% vs. $<15$\%). They do some error analysis, some of which is not convincing (e.g., they use a confidence interval of 70\% without justification). The paper is very good on reproducibility: it presents the papers used and the classification used in the appendix.} } @InProceedings{johnson00, author = {David S. Johnson}, title = {A Theoretician's Guide to the Experimental Analysis of Algorithms}, booktitle = {Dagstuhl Seminar on Experimental Algorithmics}, year = {2000}, month = sep, note = {An earlier version of \cite{johnson02}}, annote = {Gives much useful advice on producing empirical papers, most of it applicable beyond algorithm work, including lists of pitfalls, suggestions, and pet peeves. One important principle discussed is comparability (as differentiated from reproducibility).} } @InProceedings{johnson02, author = {David S. Johnson}, title = {A Theoretician's Guide to the Experimental Analysis of Algorithms}, booktitle = {Proceedings of the 5th and 6th DIMACS Implementation Challenges}, year = {2002}, url = {http://davidsjohnson.net/papers/experguide.pdf}, annote = {Gives much useful advice on producing empirical papers, most of it applicable beyond algorithm work, including lists of pitfalls, suggestions, and pet peeves. One important principle discussed is comparability (as differentiated from reproducibility).} } @Article{sima00, author = {Dezs\"o Sima}, title = {The Design Space of Register Renaming Techniques}, journal = ieeemicro, year = {2000}, volume = {20}, number = {5}, pages = {70--83}, month = sep, annote = {Describes various design choices in hardware register renaming and lists the choices taken in many existing CPUs.} } @Article{hookway&herdeg97, author = {Raymond J. Hookway and Mark A. Herdeg}, title = {{DIGITAL FX!32}: Combining Emulation and Binary Translation}, journal = {Digital Technical Journal}, year = {1997}, volume = {9}, number = {1}, pages = {3--12}, url = {http://research.compaq.com/wrl/DECarchives/DTJ/DTJP01/DTJP01P8.PS}, annote = {Among other things, this paper describes software pipelining the emulator loop; it also uses the first two bytes of each instruction as an index into the dispatch table (no word on D-cache miss rate, though).} } @Article{wulf&mckee95, author = {Wm. A. Wulf and Sally A. McKee}, title = {Hitting the Memory Wall: Implications of the Obvious}, journal = can, year = {1995}, volume = {23}, number = {1}, pages = {20--24}, month = mar, url = {ftp://ftp.cs.virginia.edu/pub/techreports/CS-94-48.ps.Z}, annote = {This paper is mentioned frequently, probably because it introduced the term \emph{memory wall}. However, the central argument of the paper is flawed. You can find a longer critique by me at http://www.complang.tuwien.ac.at/anton/memory-wall.html.} } @PhdThesis{piumarta92, author = {Ian K.
Piumarta}, title = {Delayed Code Generation in a {Smalltalk-80} Compiler}, school = {University of Manchester}, year = {1992}, url = {http://www.wolczko.com/mushroom/theses/piumarta.ps.gz}, annote = {The main topic seems to be to replace peephole optimization (and its compile-time cost) with delayed code generation to achieve similar code quality. When compiling a node, instead of generating code for moving the argument described by a node to a register, delayed code generation just creates a descriptor and leaves the optimal code generation to the user of the node. This was not original at the time; e.g., tree parsing code generation (BEG/burg) does this, and even does it in an optimal way.} } @InProceedings{hoffmann&o'donnell79, author = {Christoph M. Hoffmann and Michael J. O'Donnell}, title = {An Interpreter Generator Using Tree Pattern Matching}, booktitle = {Principles of Programming Languages (POPL'79)}, pages = {169--179}, year = {1979}, annote = {This paper sketches many ideas having to do with the equational specification of an interpreter of a functional language and the generation of a tree-parsing interpreter from the specification.} } @InProceedings{card+94, author = {R\'emy Card and Theodore Ts'o and Stephen Tweedie}, title = {Design and Implementation of the Second Extended Filesystem}, booktitle = {Proceedings of the First Dutch International Symposium on Linux}, OPTpages = {}, year = {1994}, isbn = {90-367-0385-9}, url = {http://web.mit.edu/tytso/www/linux/ext2intro.html}, OPTannote = {} } @Book{bell+90, author = {Timothy C. Bell and John G. Cleary and Ian H. Witten}, title = {Text Compression}, publisher = {Prentice-Hall}, year = {1990}, ISBN = {0-13-911991-4}, annote = {A very nice textbook on text compression covering both theory and empirical work, as well as discussing a lot of background topics. Some parts take quite some work to comprehend, though.} } @Manual{intel01, title = {Intel Pentium~4 Processor Optimization}, organization = {Intel}, year = {2001}, OPTnote = {}, OPTannote = {} } @InProceedings{hughes82, author = "R. J. M. Hughes", title = "Super-Combinators", booktitle = "Conference Record of the 1980 LISP Conference, Stanford, CA", pages = "1--11", publisher = "ACM", address = "New York", year = "1982", OPTannote = {} } @Article{hinton+01, author = {Glenn Hinton and Dave Sager and Mike Upton and Darrel Boggs and Doug Carmean and Alan Kyker and Patrice Roussel}, title = {The Microarchitecture of the Pentium~4 Processor}, journal = {Intel Technology Journal}, year = {2001}, month = {Q1}, url = {http://developer.intel.com/technology/itj/q12001/articles/art_2nav.htm}, pdf-url = {http://developer.intel.com/technology/itj/q12001/pdf/art_2.pdf}, OPTannote = {} } @TechReport{klaiber00, author = {Alexander Klaiber}, title = {The Technology Behind {Crusoe} Processors}, institution = {Transmeta Corporation}, year = {2000}, url = {http://www.transmeta.com/pdf/white_papers/paper_aklaiber_19jan00.pdf}, annote = {Gives an overview of Transmeta's processors, in particular code morphing, and discusses hardware support for dealing with various problems in code morphing (precise exceptions, aliases, self-modifying code), and power management.
For self-modifying code, it write-protects pages that contain code that has been translated; the paper also hints at more sophisticated strategies.} } @InProceedings{ghiya+01, author = {Rakesh Ghiya and Daniel Lavery and David Sehr}, title = {On the Importance of Points-To Analysis and Other Memory Disambiguation Methods For {C} Programs}, crossref = {sigplan01}, pages = {47--58}, annote = {This paper evaluates the effect of various memory disambiguation techniques on the run-time (of SPECint 2000 compiled with the Itanium compiler). Unfortunately they turned off generating code using data speculation techniques (including run-time disambiguation) in all experiments, so the results show higher speedups for static disambiguation techniques than usual on that platform. The paper also gives metrics like average points-to set size, percentages of \emph{independent}, \emph{maybe}, and \emph{dependent} answers to disambiguation queries, and which methods provided the answers. These data allow one to see whether the indirect metrics used in much of the pointer analysis literature correlate with run-time; the results are mixed, so using indirect metrics alone is not a good idea. The overall speedup from disambiguation techniques is between 2\% (181.mcf) and 26\% (256.bzip2), average 12\%. The intra- and interprocedural points-to analyses provide little speedup over simpler techniques (exception: interprocedural points-to analysis provides about 6\% speedup for 300.twolf); just analysing global variables for address-taken (interprocedurally) gives significant speedups for 175.vpr, 176.gcc, 254.gap, 256.bzip2 and 300.twolf and ``steals much of the thunder from interprocedural points-to analysis''. One interesting result is that the compiler can benefit significantly from recognizing memory-allocation routines (and conversely, performance can suffer if the compiler does not recognize a user-defined memory-allocation routine).} } @InProceedings{shaham+01, author = {Ran Shaham and Elliot K. Kolodner and Mooly Sagiv}, title = {Heap Profiling for Space-Efficient {Java}}, crossref = {sigplan01}, pages = {104--113}, annote = {Measures, in a Java environment, the drag-time, the time between an object becoming dead (i.e., the last dynamic use) and the object becoming unreachable (and thus garbage-collectable), and how much it can be reduced in various automatic ways.} } @InProceedings{evans&fraser01, author = {William S. Evans and Christopher W. Fraser}, title = {Bytecode Compression via Profiled Grammar Rewriting}, crossref = {sigplan01}, pages = {148--155}, annote = {Introduces a kind of two-level interpretation scheme, where the actual interpreted program code is a linearized representation of a parse tree for a grammar that can generate all the VM code for a basic block that the compiler can generate; so the higher level of the interpreter interprets the derivation, while the lower level interprets the VM instructions given in the grammar rules. The 256 possible bytecodes for selecting the rule in the grammar are utilized by enhancing the grammar with (otherwise redundant) rules that encode frequently occurring sequences of VM code; these rules are produced by inlining rules for non-terminals. This scheme provides more compression potential than ordinary superinstructions because the nonterminal provides additional context, and because the grammar rules can still contain non-terminals (providing the best of both superinstructions and superoperators).
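The two-level scheme might be pictured as follows (a sketch under assumed data structures, not their implementation; operand fetching and rules containing non-terminals are omitted):

    enum { OPLOAD, OPADD, OPEND };            /* assumed VM opcodes */

    /* the grammar: each rule bytecode expands into a short,
       profile-derived sequence of VM instructions */
    static const unsigned char ruleBody[256][8] = {
        { OPLOAD, OPLOAD, OPADD, OPEND },     /* e.g., a frequent sequence */
        /* ... one entry per grammar rule ... */
    };

    void run(const unsigned char *rules)
    {
        for (;;) {                            /* upper level: the derivation */
            unsigned r = *rules++;
            if (r == 255)
                return;                       /* assumed: rule 255 halts */
            const unsigned char *p = ruleBody[r];
            for (;;) {                        /* lower level: VM instructions */
                switch (*p++) {
                case OPLOAD: /* push an operand */ break;
                case OPADD:  /* add top two */     break;
                case OPEND:  goto nextRule;        /* rule exhausted */
                }
            }
        nextRule: ;
        }
    }
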
This method compresses lcc-generated bytecode down to about 40\%, with an increase of 11KB in interpreter size (most of it as grammar data). The resulting code is executable and does not require extra memory for decompression. The complete executable is about two thirds the size of an lcc-generated native-code executable, but about the same size as a MSVC-optimized executable; it remains to be seen how well code produced by an optimizing compiler would compress.} } @InProceedings{amme+01, author = {Wolfram Amme and Niall Dalton and Jeffery von Ronne and Michael Franz}, title = {Safe{TSA}: A Type Safe and Referentially Secure Mobile-Code Representation Based on Static Single Assignment Form}, crossref = {sigplan01}, pages = {137--147}, annote = {The basic ideas in this representation are: variables are named as the pair (distance in the dominator tree, assignment within basic block); variables are separated by type, with operations referring only to variables of the right type (like integer and FP instructions and registers in assemblers); memory references use types to encode that a null-pointer check and/or a range check has already occurred, allowing optimizing these operations; the resulting code is encoded (using text compression methods) in a way that supports only correct code. These ideas are discussed mostly in a general way, with some Java-specifics, but the representation supposedly also supports Fortran95 and Ada95. The representation supports some CSE, but not for address computation operations. The paper also gives numbers on size (usually a little smaller than Java bytecode), and some other static metrics, especially wrt. the effect of optimizations.} } @InProceedings{appel&george01, author = {Andrew W. Appel and Lal George}, title = {Optimal Spilling for {CISC} Machines with Few Registers}, crossref = {sigplan01}, pages = {243--253}, annote = {They divide the usual register allocation problem into allocation and assignment. The allocation part is solved optimally by modeling it as an integer linear programming problem (cost function: cost of spill and fill code and CISC memory instructions); the assignment part is solved by inserting potential parallel copies everywhere and then removing the unnecessary ones by optimistic coalescing \cite{park&moon98}; the parallel copies are sequentialized without extra registers by using the 386 instruction xchg if necessary. The resulting register allocation algorithm takes 30 times longer than the base allocator they compare with, but increases the execution speed of some benchmark programs by 0\%--25.5\% on a Pentium~II. The paper also briefly describes some approaches they tried and that did not work out, and gives an insightful comparison with previous work.} } @InProceedings{hanson&proebsting01, author = {David R. Hanson and Todd A.
Proebsting}, title = {Dynamic Variables}, crossref = {sigplan01}, pages = {264--273}, annote = {} } @Proceedings{sigplan01, booktitle = "SIGPLAN '01 Conference on Programming Language Design and Implementation", title = "SIGPLAN '01 Conference on Programming Language Design and Implementation", year = "2001", key = "PLDI '01" } @TechReport{ding&zhong01, author = {Chen Ding and Yutao Zhong}, title = {Reuse Distance Analysis}, institution = {Computer Science department, University of Rochester}, year = {2001}, number = {UR-CS-TR-741}, month = feb, url = {http://www.cs.rochester.edu/u/cding/Documents/Publications/TR741.ps}, annote = {Introduces the term ``reuse distance'' (number of references to distinct other memory items since the last use); presents an efficient algorithm for computing reuse distances (based on the counting method of Bennett and Kruskal, not the stack method of Mattson et~al.); discusses instrumenting programs at the source level to record reuse distances; presents and discusses reuse distance histograms for six Fortran~77 programs; shows the effect of reuse-driven execution and reuse-based loop fusion on the reuse distances; and presents timings for an incomplete (wrt efficiency) implementation of the algorithm.} } @Unpublished{vandrunen+01, author = {Thomas VanDrunen and Antony L. Hosking and Jens Palsberg}, title = {Reducing Loads and Stores in Stack Architectures}, note = {http://www.cs.purdue.edu/homes/palsberg/draft/vandrunen-hosking-palsberg00.ps.gz}, year = {2001}, annote = {Gives a nice overview of the previous work on stack allocation etc. Then the paper presents a calculus of transformations on straight-line, sequential code (for a JVM-like stack machine). The manuscript of September 30, 2001 does not yet explain what problem it tries to solve, and does not have a discussion of the result.} } @InProceedings{zibin&gil01, author = {Yoav Zibin and Joseph Gil}, title = {Efficient Subtyping Tests with PQ-Encoding}, crossref = {oopsla01}, pages = {96--107}, annote = {Gives a detailed overview of the type inclusion test problem and the existing solutions. Then it extends the relative numbering approach (where types are represented as intervals of integer numbers) from single inheritance to multiple inheritance. This is not possible in general with a single encoding, because it is not always possible to arrange the type hierarchy in a way that all subtypes of each type are adjacent. So the paper proposes slicing the type graph into subgraphs for which the required property holds, and having several encodings for the types; this costs mainly a little space in each type for holding the different encodings of the type, but no additional instructions in the inclusion test. The paper proposes a method for creating such encodings by using PQ-trees, and evaluates it empirically; it requires a little less space at run-time (e.g. 16KB instead of 39KB for Eiffel4) and a little more compile time than bit-packed encoding (which is based on displays); no run-time or code size numbers are presented.
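The flavour of the resulting test, sketched with hypothetical field names (relative numbering generalized to slices; my reconstruction, not the paper's code):

    #define NSLICES 4             /* assumed: small number of slices */

    typedef struct {              /* per type: its position in each slice's ordering */
        int id[NSLICES];
    } TypeIds;

    typedef struct {              /* per type: the interval covering its subtypes */
        int slice;                /* slice in which this type's subtypes are adjacent */
        int low, high;
    } TypeInterval;

    /* is a a subtype of b?  Constant time, with no more instructions than
       single-inheritance relative numbering needs. */
    int isSubtype(const TypeIds *a, const TypeInterval *b)
    {
        int i = a->id[b->slice];
        return b->low <= i && i <= b->high;
    }
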
The paper is somewhat hard to read, due to terms, notation, and abbreviations that are explained poorly or not at all, and too much mathematical notation.} } @InProceedings{alpern+01, author = {Bowen Alpern and Anthony Cocchi and Stephen Fink and David Grove}, title = {Efficient Implementation of Java Interfaces: Invokeinterface Considered Harmless}, crossref = {oopsla01}, pages = {108--124}, annote = {Uses selector indexed tables for dispatching interface calls; the twist here is that it does not use the full table indexed by type and selector, nor selector colouring (which cannot be done incrementally) to reduce the size, but instead allows conflicts in the table and uses conflict resolution stubs to resolve them. The result is called interface method tables (IMT). Also discusses how to change some interface calls to virtual calls (virtualization) based on data-flow analysis, and to devirtualize and inline some of these calls, and doing some of that in a guarded way. The empirical evaluation shows a high frequency of interface calls in benchmarks; many of them can be optimized into guarded virtual calls, either statically, or in a profile-guided way. Different realistic interface dispatch methods usually have less than 5\% impact, but reach 30\% performance variation (in both directions) in some benchmarks.} } @InProceedings{whaley01, author = {John Whaley}, title = {Partial Method Compilation using Dynamic Profile Information}, crossref = {oopsla01}, pages = {166--179}, annote = {Profiles at the basic-block level; upon exceeding a threshold, the method containing the basic block is partially (i.e., active blocks only) compiled; this saves compiling about half of the basic blocks, depending on the threshold settings. Among the optimizations performed are partial dead code elimination, and escape analysis (for allocating objects on the stack).} } @InProceedings{suganuma+01, author = {Toshio Suganuma and Toshiaki Yasue and Motohiro Kawahito and Hideaki Komatsu and Toshio Nakatani}, title = {A Dynamic Optimization Framework for a {Java} Just-In-Time Compiler}, crossref = {oopsla01}, pages = {180--194}, annote = {Describes a JVM system that starts out by interpreting JVM code (with a threshold of 2000 executions of a basic block), and then compiles the code per-method in three stages (quick optimization, full optimization, and special optimization (with specialization)). After interpretation, dynamically installed and deinstalled instrumentation is used to get profiles for further optimization.} } @InProceedings{pechtchanski&sarkar01, author = {Igor Pechtchanski and Vivek Sarkar}, title = {Dynamic Optimistic Interprocedural Analysis}, crossref = {oopsla01}, pages = {195--210}, annote = {They deal with some of the weaknesses of interprocedural analysis by using dynamic recompilation; this allows using optimistic assumptions and recompiling when they become invalid (e.g., through dynamic class loading). The paper presents an analysis framework and demonstrates it with type analysis for devirtualization and inlining as example application.
The results of this optimization are slightly better than pessimistic analysis for method calls, and sometimes a lot better for interface calls.} } @InProceedings{visser01, author = {Joost Visser}, title = {Visitor Combination and Traversal Control}, crossref = {oopsla01}, pages = {270--282}, annote = {Shows how to build general and flexible tree traversals from a few simple combinators.} } @Proceedings{oopsla01, title = {Conference on Object-Oriented Programming, Systems, Languages \& Applications (OOPSLA '01)}, booktitle = {Conference on Object-Oriented Programming, Systems, Languages \& Applications (OOPSLA '01)}, year = {2001}, key = {OOPSLA '01}, } @PhdThesis{sugumar93, author = {Rabin A. Sugumar}, title = {Multi-Configuration Simulation Algorithms for the Evaluation of Computer Architecture Designs}, school = {University of Michigan}, year = {1993}, url = {http://www.eecs.umich.edu/PPP/rabins-thesis.ps}, abstract-url = {http://www.eecs.umich.edu/PPP/rabins-thesis.html}, note = {Technical Report CSE-TR-173-93 with Santosh G. Abraham}, annote = {} } @InProceedings{naessen+01, author = {Henrik N\"ass\'en and Mats Carlsson and Konstantinos Sagonas}, title = {Instruction Merging and Specialization in the {SICStus Prolog} Virtual Machine}, booktitle = {Principles and Practice of Declarative Programming (PPDP01)}, OPTpages = {}, year = {2001}, url = {http://www.csd.uu.se/%7Ekostis/Papers/sicstus.ps.gz}, annote = {Gives an overview of various WAM optimization techniques and then evaluates combining (merging) pairs of instructions into (about 60) superinstructions, specializing WAM instructions for specific immediate arguments (in particular, specific registers, for about 200 new instructions), and a combination of both (for about 100 new instructions). Instruction merging produces small speedups (about 8\% on average), specialization produces a small slowdown on average, and both combined are about as fast as instruction merging alone. VM code size is reduced by around 10\% with these techniques, and the VM emulator size grows by up to 15KB.} } @InProceedings{noel+98, author = {Fran\c{c}ois No\"el and Luke Hornof and Charles Consel and Julia L. Lawall}, title = {Automatic, Template-Based Run-Time Specialization: Implementation and Experimental Study}, booktitle = {IEEE International Conference on Computer Languages (ICCL '98)}, pages = {123--142}, year = {1998}, url = {http://compose.labri.fr/documentation/papers/rt_bench.ps.gz}, annote = {This paper first gives a nice overview of partial evaluation and specialization, then describes implementation details of the Tempo run-time specializer for C: at compile-time the specializer produces C code for functions containing the templates for the run-time generated code in an unoptimizable context similar to the actual usage of the templates (this ensures that the register allocation etc.\ will be such that the templates fit together). Template boundaries are registered using labels-as-values and function pointers. Holes for filling in constants or calls are created by referencing external variables, and using linkage information (through GNU BFD) to find the addresses of the holes.
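The mechanics can be sketched in GNU C as follows (my reconstruction of the described technique; all names are hypothetical):

    extern int holeValue;         /* external variable standing in for a hole;
                                     linkage information (BFD) later locates
                                     the bytes that reference it */
    extern void consume(int);     /* assumed: keeps the template body alive */

    void getTemplate(void **start, void **end)
    {
        goto around;              /* the template is only copied, never run here */
    tplStart:
        consume(holeValue);       /* template body with one constant hole */
    tplEnd:
    around:
        *start = &&tplStart;      /* GNU C labels-as-values mark the */
        *end   = &&tplEnd;        /* template boundaries */
    }

    /* the specializer then copies the bytes in [start,end) into a code
       buffer and patches the hole with the run-time constant */
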
Finally, the paper shows some benchmark results; for several benchmarks run-time specialization gives 70\%--90\% of the speed of compile-time specialization, but for dot-product and dithering, where the main benefit of compile-time specialization comes from strength reduction, changes in the source code of the benchmark were required to achieve such results (these transformations can probably be automated with additional effort); the break-even point came quite soon for all benchmarks (3--87 iterations), mostly because the run-time specialization is quite quick (just copying templates and filling holes), and most of the work was already done at compile-time.} } @Book{adobe99, author = {{Adobe Systems Incorporated}}, title = {PostScript Language --- Reference Manual}, publisher = {Addison-Wesley}, year = 1999, edition = {third}, url= {http://www.adobe.com/products/postscript/pdfs/PLRM.pdf} } @Book{conklin&rather97, author = {Edward K. Conklin and Elizabeth D. Rather}, title = {Forth Programmer's Handbook}, publisher = {Forth, Inc.}, year = {1997}, isbn = {0-9662156-0-5}, OPTannote = {} } @InProceedings{shaw02, author = {Mary Shaw}, title = {What Makes Good Research in Software Engineering?}, booktitle = {Presented at the European Joint Conference of Theory and Practice of Software (ETAPS 2002), Grenoble, France. To appear in the International Journal on Software Tools for Technology Transfer.}, url = {http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/compose/www/ftp/shaw-fin-etaps.pdf}, year = {2002}, OPTnote = {}, annote = {Takes a look at various research strategies used in software engineering.} } @Article{collberg02, author = {Christian S. Collberg}, title = {Automatic derivation of compiler machine descriptions}, journal = toplas, year = {2002}, volume = {24}, number = {4}, pages = {369--408}, month = jul, annote = {Journal version of \cite{collberg97}. Extracts a machine description (for an assembler-generating code generator) out of a working C compiler by compiling various test programs to assembly language, and analysing the resulting output. Presents various innovative techniques for achieving this result.} } @Article{hoogerbrugge+99, author = "Jan Hoogerbrugge and Lex Augusteijn and Jeroen Trum and Rik van de Wiel", title = "A code compression system based on pipelined interpreters", journal = spe, volume = "29", number = "11", pages = "1005--1023", month = sep, year = "1999", OPTannote= "" } @Book{jones+93, author = {Neil D. Jones and Carsten K. Gomard and Peter Sestoft}, title = {Partial Evaluation and Automatic Program Generation}, publisher = {Prentice Hall}, year = {1993}, url = {http://www.dina.kvl.dk/~sestoft/pebook/}, OPTannote = {} } @TechReport{rossi&sivalingam96, author = {Markku Rossi and Kengatharan Sivalingam}, title = {A Survey of Instruction Dispatch Techniques for Byte-Code Interpreters}, institution = {Faculty of Information Technology, Helsinki University of Technology}, year = {1996}, number = {TKO-C79}, month = may, url = {http://www.cs.hut.fi/~cessu/papers/dispatch.ps}, annote = {Describes a number of interpreter dispatch techniques and compares five dispatch techniques empirically on five different machines (each with a different architecture); unfortunately, the benchmark is hardly described.
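For reference, this is the style of dispatch such comparisons are about, e.g. direct threading in GNU C (a generic sketch, not the paper's code):

    typedef void *Inst;           /* a VM instruction = the address of its code */

    long engine(Inst *ip, long *sp)
    {
        goto **ip++;              /* NEXT: fetch the next instruction, dispatch */
    doLit:                        /* push an inline literal */
        *sp++ = (long)*ip++;
        goto **ip++;
    doAdd:
        sp--;
        sp[-1] += sp[0];
        goto **ip++;
    doHalt:
        return sp[-1];
        /* obtaining the label addresses (&&doLit etc.) for generating
           threaded code, typically via a priming call, is omitted */
    }
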
The most remarkable part about the paper is that it is the first to propose the memcpy method for optimizing dispatch (aka selective inlining \cite{piumarta&riccardi98} or dynamic superinstructions); however, it does not explain how to perform control flow in that method (and their code example looks as if one cannot do it); it mentions problems with non-relocatable code, but does not provide a solution; it also mentions dealing with immediate arguments using a data pointer, but does not explain it in detail. The idea is attributed to Kenneth Oksanen.} } @InProceedings{rakvic+02, author = {Ryan Rakvic and Ed Grochowski and Bryan Black and Murali Annavaram and Trung Diep and John P. Shen}, title = {Performance Advantage of the Register Stack in Intel Itanium Processors}, booktitle = {Workshop on Explicitly Parallel Instruction Computing (EPIC) Architectures and Compiler Techniques}, OPTpages = {}, year = {2002}, url = {http://systems.cs.colorado.edu/EPIC2/papers/s2-1-rakvic.pdf}, annote = {Evaluates the IA64 register stack feature by comparing versions with this feature and various numbers of registers with a (simulated) version of the architecture without this feature on 250M instruction traces of a subset of the SPEC2K integer benchmarks and on Oracle. The register stack engine significantly reduces the number of loads and stores; the save-restore traffic on the benchmarks falls (on average) from 15.25\% to 3.92\% for 96 stack registers. The paper does not take the effect of shrink-wrapping \cite{chow88} into account, but on average, 76\% of the saved registers were used by the actual control flow, so shrink-wrapping could not work wonders anyway. The paper presents results on the cache impact of the presence or absence of the register stack (little overall impact). The overall impact on performance for the Itanium~2-like machine they simulated was 1.7\%--11.9\% (7\% average) speedup with 96 stack registers and slightly better with more. For a similar OoO machine model the speedup was 10.2\% for 96 stack registers and again a little more for more registers. To explain the better speedup of the OoO model, the paper presents data for the vitality (criticality) of loads, and in the OoO model 10\% of the restore loads have a dependence distance of 0 (compared to 2\% on the in-order model).} } @Article{tuomi02, author = {Ilkka Tuomi}, title = {The Lives and Death of {Moore}'s Law}, journal = {First Monday}, year = {2002}, volume = {7}, number = {11}, url = {http://firstmonday.org/issues/issue7_11/tuomi/index.html}, annote = {Takes a closer look at the various versions of Moore's law (as written by Moore and as reported by others), and provides empirical evidence that all of these versions are untrue. The final section claims that ``references to Moore's Law qualitatively miss the character of development in semiconductor technology and information society''; this section did not convince me, though.} } @Book{warren03, author = {Henry S. {Warren, Jr.}}, title = {Hacker's Delight}, publisher = {Addison-Wesley}, year = {2003}, annote = {A collection of computer arithmetic and bit-fiddling stuff; the style is quite dry, so this is not very appropriate for fun reading, more as a reference work. In any case, every compiler writer and micro-optimizer should know it.} } @InProceedings{hartstein&puzak02, author = {A. Hartstein and Thomas R.
Puzak}, title = {The Optimum Pipeline Depth for a Microprocessor}, crossref = {isca02}, pages = {7--13}, annote = {Presents a formula for the performance of a microprocessor when varying the pipeline length; the optimum pipeline length can be derived from this. Unfortunately there are two parameters in the formulae that depend on the microarchitecture and the workload, and these parameters cannot be determined analytically, only empirically. The paper also presents data from runs of a simulator with different pipeline lengths, and different (but hardly specified) workloads. The results match the curves from the formula (after fitting the missing parameters). One interesting result was that the SPEC workloads had a shorter optimum pipeline length than the other workloads used in the paper.} } @InProceedings{hrishikesh+02, author = {M. S. Hrishikesh and Norman P. Jouppi and Keith I. Farkas and Doug Burger and Stephen W. Keckler and Premkishore Shivakumar}, title = {The Optimal Logic Depth per Pipeline Stage is 6 to 8 FO4 Inverter Delays}, crossref = {isca02}, pages = {14--24}, annote = {This paper takes a low-level simulator of the 21264, varies the number of pipeline stages, uses this to run a number of workloads (actually only traces from them), and reports performance results for them. With a latch overhead of about 2 FO4 inverters, the optimal pipeline stage length is about 8 FO4 inverters (with work-load-dependent variations). Discusses various issues involved in quite some depth. In particular, this paper discusses how to pipeline the instruction window design (which has been identified as a bottleneck in earlier papers).} } @InProceedings{sprangle&carmean02, author = {Eric Sprangle and Doug Carmean}, title = {Increasing Processor Performance by Implementing Deeper Pipelines}, crossref = {isca02}, pages = {25--34}, url = {http://www.cs.cmu.edu/afs/cs/academic/class/15740-f03/public/doc/discussions/uniprocessors/technology/deep-pipelines-isca02.pdf}, annote = {This paper starts with the Willamette (Pentium~4) pipeline and discusses and evaluates changes to the pipeline length. In particular, it gives numbers on how lengthening various latencies would affect IPC; on a per-cycle basis the ALU latency is most important, then L1 cache, then L2 cache, then branch misprediction; however, the total effect of lengthening the pipeline to double the clock rate gives the reverse order (because branch misprediction gains more cycles than the other latencies). The paper reports 52 pipeline stages with 1.96 times the original clock rate as optimal for the Pentium~4 microarchitecture, resulting in a reduction of core time by a factor of 1.45 and an overall speedup of about 1.29 (including waiting for memory). Various other topics are discussed, such as nonlinear effects when introducing bypasses, and varying cache sizes. Recommended reading.} } @InProceedings{ernst&austin02, author = {Dan Ernst and Todd Austin}, title = {Efficient Dynamic Scheduling Through Tag Elimination}, crossref = {isca02}, pages = {37--46}, annote = {Propose and evaluate two methods to reduce the number of tag comparators necessary for a given instruction window size in an OoO CPU scheduler: Having special tag-reduced slots for instructions where one operand is already available (e.g., because it is an immediate operand); and predicting which tag will be the last one to become available.
Both techniques reduce IPC by a little, but reduce the critical path time through the scheduler by more, and also reduce the energy consumption.} } @InProceedings{fields+02, author = {Brian Fields and Rastislav Bodik and Mark D. Hill}, title = {Slack: Maximizing Performance under Technological Constraints}, crossref = {isca02}, pages = {47--58}, annote = {The idea here is to exploit the slack that is present on some dependence paths by using slower and cheaper (e.g., in energy consumption) resources. This paper explores this topic quite well. It discusses several slack concepts (local, global, and apportioned) and how to measure them, and presents some results on some SPEC codes. There are quite a lot of instructions that have quite a bit of slack: on average, 75\% of the instructions can be apportioned a slack of 5 cycles or more. The paper discusses how to predict slack to make use of this fact, and evaluates how various 6-wide microarchitectures with such a slack predictor would fare (some of them quite well).} } @InProceedings{kim&smith02, author = {Ho-Seop Kim and James E. Smith}, title = {An Instruction Set and Microarchitecture for Instruction Level Distributed Processing}, crossref = {isca02}, pages = {71--81}, url = {http://www.ece.wisc.edu/~hskim/papers/kimh_ildp.pdf}, annote = {This paper addresses the problems of wide superscalars with communication across the chip and the number of write ports in the register file. The authors propose an architecture (ILDP) with general-purpose registers and with accumulators (with instructions only accessing one accumulator (read and/or write) and one register (read or write)); the death of an accumulator is specified explicitly in the instructions. The microarchitecture builds \emph{strands} from instructions working on an accumulator; a strand starts with an instruction writing to an accumulator without reading from it, continues with instructions reading from (and possibly writing to) the accumulator and ends with an instruction that kills the accumulator. Strands are allocated to one out of eight processing elements (PEs) dynamically (i.e., accumulators are renamed). A PE consists of mainly one ALU data path (but also a copy of the GPRs and an L1 cache). They evaluated this architecture by translating Alpha binaries into it, and comparing their architecture to a 4-wide or 8-wide Alpha implementation; their architecture has a lower L1 cache latency, though. The performance of ILDP in clock cycles is competitive, and one can expect faster clocks for ILDP. The paper also presents data on other aspects, e.g., general-purpose register writes, which have to be promoted between strands and which are relatively few.} } @InProceedings{lewis+02, author = {Jarrod A. Lewis and Bryan Black and Mikko H. Lipasti}, title = {Avoiding Initialization Misses on the Heap}, crossref = {isca02}, pages = {183--194}, annote = {Gives measurements on the memory traffic arising from freshly allocated heap areas (23\% with a 2MB cache), and proposes a hardware scheme for avoiding it.} } @Proceedings{isca02, title = "$29^\textit{th}$ Annual International Symposium on Computer Architecture", booktitle = "$29^\textit{th}$ Annual International Symposium on Computer Architecture", year = "2002", key = "ISCA 29", } @TechReport{johnson&ritchie81, author = {Steve C. Johnson and Dennis M.
Ritchie}, title = {The C Language Calling Sequence}, institution = {Bell Laboratories}, year = {1981}, type = {Computing Science Technical Report}, number = {102}, html-url = {http://cm.bell-labs.com/cm/cs/who/dmr/clcs.html}, ps-url = {http://cm.bell-labs.com/cm/cs/who/dmr/clcs.ps}, annote = {Detailed discussion of calling conventions in general and C calling conventions in particular} } @InProceedings{ogata+02, author = {Kazunori Ogata and Hideaki Komatsu and Toshio Nakatani}, title = {Bytecode Fetch Optimization for a {Java} Interpreter}, crossref = {asplos02}, pages = {58--67}, annote = {The paper presents a Java Bytecode interpreter for the PowerPC architecture and some optimizations and evaluates them: stack caching (a few variations), position-based handler customization and position-based speculative decoding (software pipelining of the interpreter). Position-based handler customization deals with different alignments of bytecodes by having four states in the interpreter for the different alignments, each state with its own specialized copy of the interpreter. For stack caching they evaluated a fixed one-TOS-register organization with write-through caching (5.6\% speedup over base), and dynamic stack caching with two registers (3 states, 7/9\% speedup over base), and used the write-through organization for further experiments; write-through is not compared empirically to write-back. Position-based handler customization buys another 19\%, and software pipelining an additional 3.4\%. The paper also presents results on memory traffic (both I and D).} } @Proceedings{asplos02, title = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-X)", booktitle = "Architectural Support for Programming Languages and Operating Systems (ASPLOS-X)", year = "2002", key = "ASPLOS-X" } @InProceedings{stoddart02, author = {Bill Stoddart}, title = {Efficient ``Reversibility'' with Guards and Choice}, crossref = {euroforth02}, pages = {3--15}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/bill.rev.ps.gz}, abstract = {We describe reversibility mechanisms incorporated into a native code Forth used as an intermediate language for a B-GSL compiler. In contrast to our previous work, information preservation is limited to what is needed to implement the B-GSL semantics for non-deterministic choice and guard. Design choices are discussed with reference to the Pentium architecture. The use of guards and choice in Forth is illustrated with the Knight's Tour.} } @InProceedings{stoddart&zeyda02, author = {Bill Stoddart and Frank Zeyda}, title = {Implementing Sets for Reversible Computation}, crossref = {euroforth02}, pages = {16--23}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/bill.sets.ps.gz}, abstract = {Sets provide a very general tool for representing information and modelling the behaviour of systems. We consider their implementation and associated problems of garbage collection in the context of reversible computation.
We describe our implementation technique, which uses ordered arrays, and discuss scalability of performance.} } @InProceedings{gregg&waldron02, author = {David Gregg and John Waldron}, title = {Primitive Sequences in General Purpose {Forth} Programs}, crossref = {euroforth02}, pages = {24--32}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/gregg.ps.gz}, note = {Refereed}, abstract = {Instruction dispatch is responsible for most of the running time of Forth interpreters, especially on modern pipelined processors. Superinstructions are an important optimisation to reduce the number of instruction dispatches. Superinstructions have been used for many years to optimise interpreters, but an open problem is the choice of superinstructions to include in the interpreter. In this paper we propose a number of heuristics for choosing superinstructions, and evaluate them for general purpose Forth programs. We find that static measures of frequency perform well for superinstruction selection. As few as eight superinstructions can reduce the number of instruction dispatches by an average of 15\%, and reductions of up to 45\% are possible with large numbers of superinstructions.} } @InProceedings{ertl02ef, author = {M. Anton Ertl}, title = {The Evolution of Vmgen}, crossref = {euroforth02}, pages = {33--37}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ertl.ps.gz}, note = {Slides} } @InProceedings{poial02, author = {Jaanus P{\"o}ial}, title = {Stack Effect Calculus with Typed Wildcards, Polymorphism and Inheritance}, crossref = {euroforth02}, pages = {38}, slides-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/poial.ps.gz}, abstract-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/poial.txt}, note = {Abstract in hardcopy proceedings}, abstract = {In the early 1990s the author introduced a formal stack effect calculus for verification of compilers that translated high level languages (Fortran, Modula) into Forth, see [PST90],[P90a],[P90h]. The calculus was partially applicable to static type checking of Forth programs, but this was not the primary goal in those days. Stack effects (formal specifications of input and output parameters for stack operations) were defined using a flat type space where different types were considered incompatible and no subtyping or inheritance was allowed. The so called wildcard types were introduced by sets of stack effects, see [P91]. This framework does not fit well with abstract stack machines that use principles of object orientation (see, for example, [AG98] about type checking in the Java Virtual Machine). Peter Knaggs and Bill Stoddart improved the type signature algebra and introduced a lot of useful things (type variables, subtyping, reference types, wildcards, etc.), see [SK93], [K93].\par In this presentation a modified framework for type checking is proposed to support typed wildcards and inheritance. Now it is possible to perform slightly more exact type calculations and express polymorphic operations. Every type symbol has its place in the type hierarchy and, at the same time, it may be treated as a wildcard symbol. Earlier approaches matched wildcards to concrete symbols (resulting in this concrete symbol) or to other wildcards (resulting in a new wildcard); this approach is more general, allowing stepwise refinement of types.
Not only is type checking the target here, but also the (static) choice of the right version for polymorphic operations (known as method overloading in object oriented languages). Given a type hierarchy, formal specifications for operations, and a program, we can refine the type signatures in the program according to the context where an operation appears. Experimental implementation of this framework is in progress.} } @InProceedings{ceballos02udp, author = {Federico de Ceballos}, title = {UDP/IP over {Ethernet} for 8-Bit Microcontrollers}, crossref = {euroforth02}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-udp.ps.gz}, pdf-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-udp.pdf}, note = {Late paper, not in hard copy} } @InProceedings{ceballos02qnx, author = {Federico de Ceballos}, title = {Forth for the {QNX} Realtime Platform}, crossref = {euroforth02}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-qnx.ps.gz}, pdf-url = {http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ceballos-qnx.pdf}, note = {Late paper, not in hard copy} } @InProceedings{ertl02efb, author = {M. Anton Ertl}, title = {Superinstructions in {Gforth}}, crossref = {euroforth02}, note = {Demonstration only, no paper} } @Proceedings{euroforth02, title = {18th EuroForth Conference}, booktitle = {18th EuroForth Conference}, year = {2002}, key = {EuroForth'02}, editor = {M. Anton Ertl} } @InProceedings{pelc98, author = {Stephen Pelc}, title = {The {MPE} {VFX} {Forth} Code Generator}, booktitle = {EuroForth '98}, year = {1998}, url = {http://dec.bournemouth.ac.uk/forth/euro/ef98/pelc98.pdf}, annote = {Contains very little technical information, but gives a nice example of the resulting code quality.} } @InProceedings{yaghmour&dagenais00, author = {Karim Yaghmour and Michel R. Dagenais}, title = {Measuring and Characterizing System Behaviour Using Kernel-Event Logging}, crossref = {usenix00}, pages = {13--26}, annote = {Describes the Linux Trace Toolkit (LTT).} } @InProceedings{roselli+00, author = {Drew Roselli and Jacob R. Lorch and Thomas E. Anderson}, title = {A Comparison of File System Workloads}, crossref = {usenix00}, pages = {41--54}, annote = {They collected file system traces from four different environments (3 Unix, 1 WNT), and evaluate them. Some of the more interesting results are: relatively small caches (16MB or so) capture most of the cacheable read traffic; most files are either read-mostly or write-mostly; a write delay of 30s is not very effective at reducing write bandwidth (even ignoring syncs); large files are often accessed randomly.} } @InProceedings{zadok&nieh00, author = {Erez Zadok and Jason Nieh}, title = {FiST: A Language for Stackable File Systems}, crossref = {usenix00}, pages = {55--70}, annote = {Stackable file systems allow features to be added to existing file systems (e.g., encryption, unions, access control). This paper presents a language, library, and system for writing stackable file systems portably (the system currently supports Solaris, FreeBSD, and Linux).} } @InProceedings{seltzer+00, author = {Margo I. Seltzer and Gregory R. Ganger and M. Kirk McKusick and Keith A. Smith and Craig A. N. Soules and Christopher A. Stein}, title = {Journaling vs.
Soft Updates: Asynchronous Meta-Data Protection in File Systems}, crossref = {usenix00}, pages = {71--84}, annote = {Compares the features of two different journaling file systems built upon FFS and FFS with soft updates qualitatively, and compares their performance quantitatively (as well as with FFS with synchronous and with asynchronous metadata updates). This paper also explains how Soft Updates deals with the problem of having two changes to the same block, which can lead to having cycles in the dependences of the blocks.} } @InProceedings{wong&seltzer00, author = {Alexander Ya-Li Wong and Margo Seltzer}, title = {Operating System Support for Multi-User, Remote, Graphical Interaction}, crossref = {usenix00}, pages = {183--196}, annote = {Presents measurements on the performance (especially latency, but also bandwidth requirements) of X (on Linux) and WNT TSE when displaying on remote displays. Data are presented on latencies resulting from CPU load and scheduling schemes (interestingly, Linux~2.0.36 with its simple scheduler performed much better than WNT, which has a complex scheduler for favouring interactive tasks), on memory usage and the effect on latency (no surprises here), and on network performance (here, TSE's RDP protocol proved to be more bandwidth efficient than X and its compressed version LBX; a sufficiently large bitmap cache in the display would help a lot (especially in the context of animated GIFs)).} } @InProceedings{engelschall00, author = {Ralf S. Engelschall}, title = {Portable Multithreading: The Signal Stack Trick for User-Space Thread Creation}, crossref = {usenix00}, pages = {239--249}, annote = {Explains in detail how to implement a POSIX-compatible thread library with commonly available ANSI C, SUSv2, and POSIX functions. The main problem is the thread initialization. The approach described in the paper is used in the GNU Portable Threads (Pth) library.} } @InProceedings{brown&patterson00, author = {Aaron Brown and David A. Patterson}, title = {Towards Availability Benchmarks: A Case Study of Software {RAID} Systems}, crossref = {usenix00}, pages = {263--276}, annote = {First discusses the methodology of availability benchmarks (very interesting) and then gives an example of how to apply such a methodology to RAID systems on different OSs (Solaris, Linux, WNT).} } @InProceedings{smaragdakis&wilson00, author = {Yannis Smaragdakis and Paul Wilson}, title = {Performing Replacement in Modem Pools}, crossref = {usenix00}, pages = {277--291}, annote = {The problem discussed here is which supposedly-inactive modem connection to drop if another user calls in and all modems are in use. The paper presents the CIRG (conditional inter-reference gap) algorithm, which performs a little better than LRU.} } @Proceedings{usenix00, title = {Usenix Annual Technical Conference}, booktitle = {Usenix Annual Technical Conference}, year = {2000}, key = {Usenix '00} } @InProceedings{adya+02, author = {Atul Adya and William J. Bolosky and Miguel Castro and Gerald Cermak and Ronnie Chaiken and John R. Douceur and Jon Howell and Jacob R. Lorch and Marvin Theimer and Roger P. Wattenhofer}, title = {{FARSITE}: Federated, Available, and Reliable Storage for an Incompletely Trusted Environment}, crossref = {osdi02}, pages = {1--14}, annote = {A distributed file system for WANs consisting of untrusted systems with a design goal of 100,000 machines.
The machines are assumed to be typical PC/desktop machines, and the users are assumed to behave like desktop users (in particular, concurrent distributed write access is rare, and usually not very massive). The system is based on groups of Byzantine-fault-tolerant machines (up to 1/3rd of the machines can fail without the system failing) for serving subtrees of the global directory tree; these \emph{directory groups} are a central concept in FARSITE. Encrypted files with local caching and log-based update mechanisms are used for individual files, with leases for various access types granted by the directory group. The paper discusses various aspects of the design in depth and provides some performance numbers. Recommended reading.} } @InProceedings{saito+02, author = {Yasushi Saito and Christos Karamanolis and Magnus Karlsson and Mallik Mahalingam}, title = {Taming Aggressive Replication in the {Pangaea} Wide-Area File System}, crossref = {osdi02}, pages = {15--30}, annote = {Pangaea is a distributed file system for a WAN of trusted (as of this paper) PC-type computers. File replicas are used for providing good access performance and availability. The paper discusses in depth how replicas are managed, in particular how updates and replicas are distributed efficiently without requiring special administration. Pangaea uses optimistic update, and thus has the potential for update conflicts. The paper contains a detailed empirical evaluation.} } @InProceedings{muthitacharoen+02, author = {Athicha Muthitacharoen and Robert Morris and Thomer M. Gil and Benjie Chen}, title = {Ivy: A Read/Write Peer-to-Peer File System}, crossref = {osdi02}, pages = {31--44}, annote = {A distributed file system based on the DHash peer-to-peer block storage system, without full trust. The file system is a kind of distributed version of a log-structured file system (cleaning is hardly discussed in the paper), with a file system consisting of several logs, one log per update source; users can decide to use (trust) different logs, resulting in different views of the file system, but cooperating users should use the same view. Snapshots are used to avoid having to traverse the log for older data. The paper discusses various aspects and provides performance data.} } @InProceedings{kumar&li02, author = {Sanjeev Kumar and Kai Li}, title = {Using Model Checking to Debug Device Firmware}, crossref = {osdi02}, pages = {61--74}, annote = {Describes the domain-specific language ESP for writing device drivers, how it supports model checking, and experiences with model-checking device drivers.} } @InProceedings{musuvathi+02, author = {Madanlal Musuvathi and David Y. W. Park and Andy Chou and Dawson R. Engler and David L. Dill}, title = {{CMC}: A Pragmatic Approach to Model Checking Real Code}, crossref = {osdi02}, pages = {75--88}, annote = {Describes a model checker for C and experiences in using it. One point that the paper makes is that while the model checker may not terminate in practical time (and thus prove correctness or at least produce an exhaustive list of bugs), it does produce some bug reports.
Also, the paper observes that correct programs take much less time to model-check, so after removing the bugs that the model-checker finds, it might terminate.} } @Proceedings{osdi02, title = {Operating Systems Design and Implementation (OSDI '02)}, booktitle = {Operating Systems Design and Implementation (OSDI '02)}, year = {2002}, key = {OSDI '02} } @TechReport{moudgill&moreno96, author = {Mayan Moudgill and Jaime Moreno}, title = {Run-Time Detection and Recovery From Incorrectly Reordered Memory Operations}, institution = {IBM}, year = {1996}, number = {RC20857}, abstract-url = {http://domino.watson.ibm.com/library/CyberDig.nsf/0/12a089effaf3a918852565930072a0db?OpenDocument}, url = {http://domino.watson.ibm.com/library/CyberDig.nsf/papers/12A089EFFAF3A918852565930072A0DB/%24File/8692.ps.gz}, annote = {Propose a new method for the run-time disambiguation of aliases: move the load up across the store(s), and after the last store, load from the address again, and compare the loaded value with the value produced by the moved-up load; if they are equal, no destructive aliasing occurred, and there is no need to execute compensation code. This method does not require overhead proportional to loads*stores, unlike \cite{nicolau89}, and also does not require special hardware \cite{gallagher+94}. The paper also discusses how to integrate this transformation in a compiler and gives some results.} } @InProceedings{ramsey03, author = {Norman Ramsey}, title = {Embedding an Interpreted Language Using Higher-Order Functions and Types}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {6--14}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/ramsey.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858571&type=pdf}, abstract = {Using an embedded, interpreted language to control a complicated application can have significant software-engineering benefits. But existing interpreters are designed for embedding into C code. To embed an interpreter into a different language requires a suitable API. Lua-ML is a new API that uses higher-order functions and types to simplify the use of an embedded interpreter. A typical application-program function can be added to a Lua-ML interpreter simply by describing the function's type.} } @InProceedings{liu&moore03, author = {Hanbing Liu and J. Strother Moore}, title = {Executable JVM Model for Analytical Reasoning: A Study}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {15--23}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/liu.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858572&type=pdf}, abstract = {To study the properties of the Java Virtual Machine (JVM) and Java programs, our research group has produced a series of JVM models written in a functional subset of Common Lisp.
In this paper, we present our most complete JVM model from this series, namely, M6, which is derived from a careful study of the J2ME KVM[16] implementation.} } @InProceedings{franz+03, author = {Michael Franz and Deepak Chandra and Andreas Gal and Vivek Haldar and Fermin Reig and Ning Wang}, title = {A Portable Virtual Machine Target for Proof-Carrying Code}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {24--31}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/franz.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858573&type=pdf}, abstract = {Virtual Machines (VMs) and Proof-Carrying Code (PCC) are two techniques that have been used independently to provide safety for (mobile) code. Existing virtual machines, such as the Java VM, have several drawbacks: First, the effort required for safety verification is considerable. Second and more subtly, the need to provide such verification by the code consumer inhibits the amount of optimization that can be performed by the code producer. This in turn makes just-in-time compilation surprisingly expensive. Proof-Carrying Code, on the other hand, has its own set of limitations, among which are the sizes of the proofs and the fact that the certified code is no longer machine-independent. In this paper, we describe work in progress on combining these approaches. Our hybrid safe-code solution uses a virtual machine that has been designed specifically to support proof-carrying code, while simultaneously providing efficient just-in-time compilation and target-machine independence. In particular, our approach reduces the complexity of the required proofs, resulting in fewer proof obligations that need to be discharged at the target machine. } } @InProceedings{lattendresse&feeley03, author = {Mario Latendresse and Marc Feeley}, title = {Generation of Fast Interpreters for {Huffman} Compressed Bytecode}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {32--40}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/latendresse.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858574&type=pdf}, abstract = {Embedded systems often have severe memory constraints requiring careful encoding of programs. For example, smart cards have on the order of 1K of RAM, 16K of non-volatile memory, and 24K of ROM. A virtual machine can be an effective approach to obtain compact programs but instructions are commonly encoded using one byte for the opcode and multiple bytes for the operands, which can be wasteful and thus limit the size of programs runnable on embedded systems. Our approach uses canonical Huffman codes to generate compact opcodes with custom-sized operand fields and with a virtual machine that directly executes this compact code. We present techniques to automatically generate the new instruction formats and the decoder. In effect, this automatically creates both an instruction set for a customized virtual machine and an implementation of that machine. We demonstrate that, without prior decompression, fast decoding of these virtual compressed instructions is feasible. Through experiments on Scheme and Java, we demonstrate the speed of these decoders. Java benchmarks show an average execution slowdown of 9\%. Compression factors highly depend on the original bytecode and the training sample, but typically vary from 30\% to 60\%.
} } @InProceedings{davis+03, author = {Brian Davis and Andrew Beatty and Kevin Casey and David Gregg and John Waldron}, title = {The Case for Virtual Register Machines}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {41--49}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/davis.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858575&type=pdf}, abstract = {Virtual machines (VMs) are a popular target for language implementers. Conventional wisdom tells us that virtual stack architectures can be implemented with an interpreter more efficiently, since the location of operands is implicit in the stack pointer. In contrast, the operands of register machine instructions must be specified explicitly. In this paper, we present a working system for translating stack-based Java virtual machine (JVM) code to a simple register code. We describe the translation process, the complicated parts of the JVM which make translation more difficult, and the optimisations needed to eliminate copy instructions. Experimental results show that a register format reduces the number of executed instructions by 34.88%, while increasing the number of bytecode loads by an average of 44.81%. Overall, this corresponds to an increase of 2.32 loads for each dispatch removed. We believe that the high cost of dispatches makes register machines attractive even at the cost of increased loads.} } @InProceedings{sullivan+03, author = {Gregory T. Sullivan and Derek L. Bruening and Iris Baron and Timothy Garnett and Saman Amarasinghe}, title = {Dynamic Native Optimization of Interpreters}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {50--57}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/sullivan.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858576&type=pdf}, abstract = {For domain specific languages, "scripting languages", dynamic languages, and for virtual machine-based languages, the most straight-forward implementation strategy is to write an interpreter. A simple interpreter consists of a loop that fetches the next bytecode, dispatches to the routine handling that bytecode, then loops. There are many ways to improve upon this simple mechanism, but as long as the execution of the program is driven by a representation of the program other than as a stream of native instructions, there will be some "interpretive overhead".\par There is a long history of approaches to removing interpretive overhead from programming language implementations. In practice, what often happens is that, once an interpreted language becomes popular, pressure builds to improve performance until eventually a project is undertaken to implement a native Just In Time (JIT) compiler for the language. Implementing a JIT is usually a large effort, affects a significant part of the existing language implementation, and adds a significant amount of code and complexity to the overall code base.\par In this paper, we present an innovative approach that dynamically removes much of the interpreted overhead from language implementations, with minimal instrumentation of the original interpreter. 
While it does not give the performance improvements of hand-crafted native compilers, our system provides an appealing point on the language implementation spectrum.} } @InProceedings{whaley03, author = {John Whaley}, title = {Joeq: A Virtual Machine and Compiler Infrastructure}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {58--66}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/whaley.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858577&type=pdf}, abstract = {Joeq is a virtual machine and compiler infrastructure designed to facilitate research in virtual machine technologies such as Just-In-Time and Ahead-Of-Time compilation, advanced garbage collection techniques, distributed computation, sophisticated scheduling algorithms, and advanced run time techniques. Joeq is entirely implemented in Java, leading to reliability, portability, maintainability, and efficiency. It is also language-independent, so code from any supported language can be seamlessly compiled, linked, and executed - all dynamically. Each component of the virtual machine is written to be independent with a general but well-defined interface, making it easy to experiment with new ideas. Joeq is released as open source software, and is being used as a framework by researchers all over the world on topics ranging from automatic distributed virtual machines to whole-program pointer analysis.} } @InProceedings{palacz+03, author = {K. Palacz and J. Baker and C. Flack and C. Grothoff and H. Yamauchi and J. Vitek}, title = {Engineering a Customizable Intermediate Representation}, booktitle = {Interpreters, Virtual Machines and Emulators (IVME~'03)}, pages = {67--76}, year = {2003}, url1 = {http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/palacz.ps.gz}, url2 = {http://portal.acm.org/ft_gateway.cfm?id=858578&type=pdf}, abstract = {The Ovm framework is a set of tools and components for building language runtimes. We present the intermediate representation and software design patterns used throughout the framework. One of the main themes in this work has been to support experimentation with new linguistic constructs and implementation techniques. To this end, framework components were designed to be parametric with respect to the instruction set on which they operate. We argue that our approach eases the task of writing new components without sacrificing efficiency.} } @TechReport{bak&griesemer03, author = {Lars Bak and Robert Griesemer}, title = {Interpreting Functions Utilizing a Hybrid of Virtual and Native Machine Instructions}, institution = {US}, year = {2003}, type = {Patent}, number = {6513156 B2}, annote = {This patent describes how to replace some interpreted sequences in JVM code (apparently restricted to straight-line code) with native code (somewhat like \cite{yannikos94}). The native code is apparently generated by macro-expansion of the JVM instructions, as is done in simple Forth native-code compilers \cite{rose86,paysan91}. There is no explanation of how the native code for each JVM instruction is generated. The patent discusses the management of native-code snippets a lot and presents a complex solution, but does not give a rationale for that.
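(To make the macro-expansion idea concrete, here is my own minimal C sketch, with assumed names and templates, not code from the patent: native-code generation for straight-line VM code amounts to concatenating a pre-assembled machine-code template per opcode.)

#include <string.h>
typedef struct { const char *code; int len; } Template;
extern const Template templates[256]; /* per-opcode native snippets, produced in a machine-specific way */

/* expand a straight-line VM sequence into native code by pasting templates */
char *expand(char *out, const unsigned char *vm, int n) {
  for (int i = 0; i < n; i++) {
    const Template *t = &templates[vm[i]];
    memcpy(out, t->code, t->len);
    out += t->len;
  }
  return out; /* the caller appends a dispatch back into the interpreter */
}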
No evaluation of the proposed approach is given, and the presentation is pretty bad.} } @TechReport{griesemer01, author = {Robert Griesemer}, title = {Interpreter Generation and Implementation Utilizing Interpreter States and Register Caching}, institution = {US}, year = {2001}, type = {Patent}, number = {6192516 B1}, annote = {This patent describes a JVM interpreter with dynamic stack caching, and how it is generated. The stack cache keeps 0 or 1 stack items in registers; the interesting variation here is that the system has four states for one stack item in registers, one state for each type (int, long, float, double). The interpreter is generated by producing, for each state/instruction combination, a prefix that sets up the stack state, a template that does the main work, and a suffix that dispatches the next instruction (somewhat like vmgen \cite{ertl93,ertl+02}). The native code for these parts is generated in a machine-specific way (through an assembler-in-C++). The paper also mentions how to share code between different implementations of an instruction for different states. The patent presents no evaluation of the approach.} } @InProceedings{griesemer99, author = {Robert Griesemer}, title = {Generation of Virtual Machine Code at Startup}, booktitle = {OOPSLA '99 Workshop on Simplicity, Performance, and Portability in Virtual Machine Design}, year = {1999}, annote = {This paper argues that using ordinary assemblers for writing part of the program has several disadvantages, and proposes generating machine code through an assembler-in-C++ at run-time. The example given is the HotSpot JVM interpreter, which is generated at startup of the JVM in this way. An additional benefit in this context was that the infrastructure for the machine-code generation is also needed for the JIT part of HotSpot.} } @InProceedings{costa99, Author = {Santos Costa, V\'{\i}tor}, Title = {Optimising Bytecode Emulation for {Prolog}}, Booktitle = "LNCS 1702, Proceedings of PPDP'99", Publisher="Springer-Verlag", Month = {September}, Pages="261--267", Year = "1999" } @Article{grant+00, author = {Brian Grant and Markus Mock and Matthai Philipose and Craig Chambers and Susan J. Eggers}, title = {{DyC}: An Expressive Annotation-Directed Dynamic Compiler for {C}}, journal = {Theoretical Computer Science}, year = {2000}, volume = {248}, number = {1--2}, pages = {147--199}, abstract-url = {http://www.cs.washington.edu/research/projects/unisw/DynComp/www/Papers/tr-97-03-03-abstract.html}, url = {http://www.cs.washington.edu/research/projects/unisw/DynComp/www/Papers/tr-97-03-03.ps.gz}, OPTannote = {} } @InProceedings{peng+04, author = {Jinzhan Peng and Gansha Wu and Guei-Yuan Lueh}, title = {Code Sharing among States for Stack-Caching Interpreter}, crossref = {ivme04}, pages = {15--22}, year = {2004}, OPTannote = {} } @InProceedings{vitale&abdelrahman04, author = {Benjamin Vitale and Tarek S. 
Abdelrahman}, title = {Catenation and Specialization for {Tcl} Virtual Machine Performance}, crossref = {ivme04}, pages = {42--50}, year = {2004}, OPTannote = {} } @Proceedings{ivme04, booktitle = {IVME '04 Proceedings}, title = {IVME '04 Proceedings}, year = {2004}, OPTeditor = {} } @MastersThesis{wu96, author = {Qunyan Wu}, title = {Register Allocation via Hierarchical Graph Coloring}, school = {Michigan Technological University}, year = {1996}, url = {ftp://cs.mtu.edu/pub/carr/qwu.thesis.ps.gz}, annote = {The author compares Hierarchical Graph Coloring \cite{callahan&koblenz91} with Briggs' Graph Coloring \cite{briggs+89} (without Briggs' version of live range splitting). The results indicate that hierarchical graph coloring does worse on most benchmarks. The paper looks at several variations of hierarchical graph coloring to identify the performance impact of the different elements of hierarchical graph coloring: Using different numbers of reserved registers (the more are reserved, the worse the results are for most benchmarks); eliminating tiling in several stages (splitting the program into fewer tiles usually has a positive effect on the results); the effect of using register preferences (mostly little effect, but occasionally large effects in both directions); and what happens if spill cost is ignored (then hierarchical graph coloring loses because it introduces additional jumps (to get single-entry-single-exit tiles) and moves). In general the evaluation is quite nice and very thorough, but I wonder if adding some postprocessing to eliminate the additional jumps would not have made hierarchical graph coloring look better throughout all variations (but probably not good enough to make it worthwhile).} } @PhdThesis{winkel04, author = {Sebastian Winkel}, title = {Optimal Instruction Scheduling for the Itanium Processor Architecture}, school = {Universit\"at des Saarlandes}, year = {2004}, url = {http://www.dagstuhl.de/files/Proceedings/05/05101/05101.WinkelSebastian.Paper.pdf}, OPTannote = {} } @Article{fu+05, author = {Changqing Fu and Kent Wilken and David Goodwin}, title = {A Faster Optimal Register Allocator}, journal = {Journal of Instruction-Level Parallelism}, year = {2005}, volume = {7}, OPTnumber = {}, OPTpages = {}, month = jan, note = {http://www.jilp.org/vol7/v7paper1.pdf}, url = {http://www.jilp.org/vol7/v7paper1.pdf}, annote = {This paper reports progress over earlier work \cite{goodwin&wilken96} on optimal register allocation with integer (linear) programming: Many of the load points, store points, or deallocation points used in the original model are unnecessary for optimality. A model with fewer such points can be solved faster. The paper presents ways to eliminate most of the redundant points, and evaluates them on the SPEC CPU92 and SPEC CPU2000 benchmarks. It also reevaluates the old model on current hardware with a current ILP solver, and compares the three sets of results, showing that all three components (hardware, the better ILP solver, and the improved model) contribute 1--2 orders of magnitude of speedup. The new optimal register allocator can optimally allocate 98\% of the functions in the SPEC CPU2000 benchmarks and find a near-optimal result for another 1\%, resulting in a dynamic instruction reduction on HP-PA of 5.3\%--19.2\% over a graph colouring register allocator. The boundary between solved and unsolved is for function sizes in the range of 90--2000 instructions.} } @InProceedings{makarov04, author = {Vladimir N.
Makarov}, title = {Fighting Register Pressure in {GCC}}, booktitle = {{GCC} Developers' Summit 2004}, pages = {85--103}, year = {2004}, url = {http://gcc.fyxm.net/summit/2004/Fighting%20Register%20Pressure.pdf}, annote = {This paper discusses a number of improvements for the original register allocator of gcc (which is still competitive), and presents results for each of them on the SPEC CPU2000 or CPU95 benchmarks on Pentium~4 or Athlon~MP machines (i.e., they were all implemented and worked). There are so many results that I don't want to summarize all of them here (read the paper); one of the results is that even register-pressure sensitive prepass instruction scheduling led to significant slowdowns.} } @InProceedings{kim&lipasti04, author = {Ilhyun Kim and Mikko H. Lipasti}, title = {Understanding Scheduling Replay Schemes}, booktitle = {10th International Symposium on High Performance Computer Architecture (HPCA'04)}, pages = {198--209}, year = {2004}, url = {http://www.ece.wisc.edu/~ikim/hpca2004ikim.pdf}, annote = {Hardware schedulers have to issue instructions before they know the exact latencies of the instructions they depend on (e.g., latencies can vary because of cache misses). Replay schemes deal with this problem by scheduling instructions early, and canceling and replaying them if their operands are not available. However, the scheduler may already have issued instructions depending on that instruction, and they all have to be canceled and replayed. This paper explores various replay schemes, discusses their strengths and weaknesses, and presents empirical results.} } @Misc{mashey2005, author = {John Mashey}, title = {{SPEC} use of Geometric Mean}, howpublished = {Usenet Article <1115972116.172947.194880@g44g2000cwa.googlegroups.com>}, month = may, year = {2005}, url = {http://groups.google.at/group/comp.arch/msg/416e58b5e48c1715}, annote = {Discusses why the geometric mean is appropriate in many cases for aggregating benchmark results (like SPEC) into one number: The statistical distribution of the values is usually log-normal (and the reasons for this are discussed), and the right way to aggregate such values is the geometric mean (which is the arithmetic mean in the log scale).} } @Article{bartley92, author = {David H. Bartley}, title = {Optimizing Stack Frame Accesses for Processors with Restricted Addressing Modes}, journal = spe, year = {1992}, volume = {22}, number = {2}, pages = {101--110}, OPTannote = {} } @Article{liao+96, author = {Stan Liao and Srinivas Devadas and Kurt Keutzer and Steve Tjiang and Albert Wang}, title = {Storage Assignment to Decrease Code Size}, journal = toplas, year = {1996}, volume = {18}, number = {3}, pages = {235--253}, OPTannote = {} } @Manual{bundy+85, title = {The Researcher's Bible}, author = {Alan Bundy and Ben du Boulay and Jim Howe and Gordon Plotkin}, year = {1985}, url = {http://homepages.inf.ed.ac.uk/bundy/how-tos/resbible.html}, annote = {Good advice on the problems on the way to a Ph.D. and how to overcome them. An earlier version is \cite{bundy+84}; this version is being maintained.} } @Article{aycock03, author = {John Aycock}, title = {A Brief History of Just-In-Time}, journal = {ACM Computing Surveys}, year = {2003}, volume = {35}, number = {2}, pages = {97--113}, month = jun, annote = {Gives an overview of the research in systems that generate code at run-time.
Contains many references, including some to papers that previously escaped my literature searches.} } @Misc{kuhn05unicode, author = {Markus Kuhn}, title = {{UTF-8} and {Unicode} {FAQ} for {Unix/Linux}}, howpublished = {http://www.cl.cam.ac.uk/\~{}mgk25/\linebreak[0]unicode.html}, year = {2005}, OPTannote = {} } @Misc{pelc&knaggs01widechar, author = {Stephen Pelc and Peter Knaggs}, title = {{ANS} {Forth} and Large Characters}, howpublished = {http://www.mpeforth.com/arena/\linebreak[0]i18n.widechar.v7.PDF}, year = {2001}, OPTannote = {} } @Article{rather85, author = {Elizabeth D. Rather}, title = {Fifteen Programmers, 400 Computers, 36,000 Sensors and {FORTH}}, journal = jfar, year = {1985}, volume = {3}, number = {2}, pages = {46--73}, annote = {Describes how Forth was used in an automation system for the Riyadh airport.} } @Article{ierusalimschy+05, author = {Roberto Ierusalimschy and Luiz Henrique de Figueiredo and Waldemar Celes}, title = {The Implementation of Lua 5.0}, journal = {Journal of Universal Computer Science}, year = {2005}, volume = {11}, number = {7}, pages = {1159--1176}, url = {http://www.tecgraf.puc-rio.br/~lhf/ftp/doc/jucs05.pdf}, annote = {This paper discusses a number of implementation issues in Lua 5.0 (and how they relate to the Lua design goals). It is well-written and does not require familiarity with Lua. The issues are: how values are represented; an optimization of (hash) tables such that arrays are used for a part of the table that is indexed with dense integer keys; how free variables in closures are represented; the implementation of threads and coroutines; and the virtual machine. The virtual machine was changed from a stack-based VM to a ``register''-based one in Lua 5.0 (actually it's an indexed-stack VM, or an IA64-style register stack); the paper gives some reasoning and presents advantages of this VM style (although it seems a little one-sided to me); it also provides timing results: the speedups from the VM change alone are 1.02--1.40 for a selection of the shootout benchmarks, and 2.28 for a microbenchmark. The paper does not say so explicitly, but one reason for the good performance of the new VM over the old stack-based VM is probably that both VMs use the slow switch-based dispatch method (for ANSI C compliance), so the reductions in executed VM instructions from the new VM have more effect than they would have with more efficient dispatch methods.} } @Book{lindholm&yellin97, author = {Tim Lindholm and Frank Yellin}, title = {The Java Virtual Machine Specification}, publisher = {Addison-Wesley}, year = {1997}, edition = {First} } @Book{lindholm&yellin99, author = {Tim Lindholm and Frank Yellin}, title = {The Java Virtual Machine Specification}, publisher = {Addison-Wesley}, year = {1999}, edition = {Second} } @Article{thibault+00, author = {Scott Thibault and Charles Consel and Julia L. Lawall and Renaud Marlet and Gilles Muller}, title = {Static and Dynamic Program Compilation by Interpreter Specialization}, journal = {Higher-Order and Symbolic Computation}, year = {2000}, volume = {13}, number = {3}, pages = {161--178}, month = sep, annote = {The authors use the Tempo specializer to convert several interpreters to compilers; compile-time specialization is used to convert interpreters to ahead-of-time compilers, run-time specialization for JIT compilers. They do this for three byte-code interpreters: Harissa (JVM), Ocaml, and the Berkeley Packet Filter. 
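(For intuition, here is my own toy C illustration of the specialization idea, not an example from the paper: specializing the interpreter below with respect to a fixed program unrolls the dispatch loop and constant-folds the opcodes, leaving only residual straight-line code.)

enum { PUSH, ADD, HALT };

/* a toy stack VM */
int run(const int *prog) {
  int stack[16];
  int sp = 0, pc = 0;
  for (;;) {
    switch (prog[pc++]) {
    case PUSH: stack[sp++] = prog[pc++]; break;
    case ADD:  sp--; stack[sp-1] += stack[sp]; break;
    case HALT: return stack[sp-1];
    }
  }
}

/* what a specializer could leave behind for prog = {PUSH,2,PUSH,3,ADD,HALT}: */
int run23(void) { return 2 + 3; }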
They have to transform the interpreter into a tail-recursive function so that the specializer can deal with conditional VM branches. Indirect VM branches require a translation table from VM-code addresses to specialized-function pointers; the paper does not say what changes in the interpreter this required. This introduction of calls requires that the VM registers be held in global variables, which causes the biggest slowdown of the resulting code compared to native-code compilers. The authors also applied Tempo to two interpreters described as \emph{structured code interpreters} for domain-specific languages (PLAN-P and GAL); these interpretive systems consume source code, and it is not clear what intermediate representation they use, if any; the paper also does not discuss what changes were applied to these two interpreters to facilitate specialization. For all interpreters, the paper presents encouraging performance data. The paper is nicely written and easy to read.} } @TechReport{arnold+04, author = {Matthew Arnold and Stephen J. Fink and David Grove and Michael Hind and Peter F. Sweeney}, title = {A Survey of Adaptive Optimization in Virtual Machines}, institution = {IBM}, year = {2004}, type = {Research Report}, number = {RC23143 (W0312-097)}, annote = {The title says it all. It is certainly a good place to start if you want to get into the area, even though the writing style is not very exciting and the paper mentions only a few of the older works on the topic, and covers mostly newer work in its 165 references.} } @InProceedings{gagnon&hendren03, author = {Etienne Gagnon and Laurie Hendren}, title = {Effective Inline-Threaded Interpretation of {Java} Bytecode Using Preparation Sequences}, booktitle = {Compiler Construction (CC '03)}, pages = {170--184}, year = {2003}, volume = {2622}, series = {LNCS}, publisher = {Springer}, annote = {This paper is about implementing dynamic superinstructions in the SableVM, and especially about dealing with lazy class initialization in that context. The starting point is a version of the JVM translated into threaded code. Race conditions in quickening are avoided by using an additional field in the instruction. The main contribution of the paper is the preparation sequences: out-of-line threaded-code sequences of simple and non-quick VM instructions, with a REPLACE instruction after the last non-quick instruction that rewrites the GOTO to the out-of-line sequence into the dynamic superinstruction (with everything quickened). SableVM superinstructions do not keep the instruction slots for all original instructions, so the preparation cannot be performed in-line (as is done in the Cacao interpreter). The paper contains empirical results for the overall effect of dynamic superinstructions (a speedup of up to a factor of 2.14, for compress), and for the effect of using preparation sequences compared to not putting quickenable instructions into dynamic superinstructions (a factor of up to 1.49, for compress).} } @Book{graham04, author = {Paul Graham}, title = {Hackers \& Painters}, publisher = {O'Reilly}, year = {2004}, annote = {A collection of essays on various topics: US high schools, Startups, Economics, Programming Languages.
While I don't buy everything he writes, the essays are interesting and easy to read, and contain interesting ideas.} } @InProceedings{berndl+05, author = {Marc Berndl and Benjamin Vitale and Mathew Zaleski and Angela Demke Brown}, title = {Context Threading: A Flexible and Efficient Dispatch Technique for Virtual Machine Interpreters}, booktitle = {Code Generation and Optimization (CGO)}, pages = {15--26}, year = {2005}, annote = {Dynamic superinstructions with replication decrease the number of indirect branch mispredictions, but can substantially increase the number of I-cache misses. Context threading is a variant of subroutine threading that addresses these problems: The basic technique is a (non-inlining) subroutine-threaded variant of dynamic superinstructions (which can be considered the inlining variant of context threading): in addition to the subroutine calls, there is still a direct-threaded code area used for inline arguments and for control flow. In order to get rid of the indirect jumps and their mispredictions on control flow, context threading translates virtual machine branches, calls and returns in a special way (in particular, calls and returns use the hardware return stack); they also explore inlining very short VM instruction implementations. The authors evaluate their approach on the Ocaml interpreter and on SableVM on a Pentium~4, PPC~7410, and PPC970; they achieve large reductions in branch mispredictions, and decent speedups. The paper does not discuss the amount of code necessary per CPU or the problems that using an SP-changing call instruction can cause (they mention it for the call optimization, but not for the subroutine threading).} } @InProceedings{hsieh+01, author = {Wilson C. Hsieh and Dawson R. Engler and Godmar Back}, title = {Reverse-Engineering Instruction Encodings}, booktitle = {USENIX Annual Technical Conference}, OPTpages = {}, year = {2001}, url1 = {http://www.cs.utah.edu/~wilson/papers/derive-usenix01.pdf}, url2 = {http://www.cs.utah.edu/~wilson/papers/derive-usenix01.html}, annote = {DERIVE is a tool that takes a description of valid assembler inputs, feeds a subset of the possible inputs through the assembler, looks at the output, and determines the instruction encodings, which can then be used for JIT compilers. The paper describes how DERIVE works and how it is used. DERIVE has been used to determine the encodings of six architectures (two only partially). The DERIVE code is available on-line.} } @Article{wirth06, author = {Niklaus Wirth}, title = {Good Ideas, Through the Looking Glass}, journal = ieeecomputer, year = {2006}, pages = {28--39}, month = jan, annote = {The author looks at some past technologies in hardware, computer architecture, and programming languages, most of which he considers bad ideas in hindsight.
In some cases (e.g., bubble memory), his opinion is probably universally held now, so his review seems kind of pointless; in other cases, his opinion is more controversial (e.g., for functional programming) or contrary to mainstream opinion (for virtual memory), so the paper may be good for inspiring discussions, if nothing else.} } @Article{thompson68, author = {Ken Thompson}, title = {Regular Expression Search Algorithm}, journal = cacm, year = {1968}, volume = {11}, number = {6}, pages = {419--422}, month = jun, annote = {Regular expressions are first translated into postfix representation, then into machine code for the IBM 7094, where the flow diagram looks like a syntax diagram (or a funny variant of the NFA) of the regular expression (it is unclear what control flow actually happens for the $\oplus$ (alternative) node). The paper presents Algol code for the translation, and IBM 7094 assembly code for the resulting output. Supposedly this paper does lazy NFA->DFA conversion, but that is not discussed in the paper; it might be implicit in the 7094 assembly code for CNODE and NNODE. \code{} discusses this in more detail. In any case, this implementation does not seem to retain the DFA states.} } @InProceedings{nethercote&mycroft02, author = {Nicholas Nethercote and Alan Mycroft}, title = {The Cache Behaviour of Large Lazy Functional Programs on Stock Hardware}, booktitle = {Memory Systems Performance (MSP '02)}, pages = {44--55}, year = {2002}, url = {http://www.cs.mu.oz.au/~njn/pubs/cache-large-lazy2002.ps.gz}, annote = {This paper analyses the performance mainly of GHC-compiled Haskell programs on an Athlon using mainly performance counters. The paper looks at varying the GC nursery size (160KB seems to be quite good for most programs on that CPU, which has a 256KB exclusive L2 cache); they also varied the initial heap size, and that has different effects on different programs, but the paper gives no explanation for that. They use a simple linear execution cost model ($\mathrm{cycles} = 0.8\,\mathrm{instructions} + 12\,\mbox{D-cache misses} + 206\,\mbox{L2-cache misses} + 10\,\mbox{branch mispredictions}$), which proves to be surprisingly accurate for most benchmarks; this helps them explain the results: Two significant sources of stalls are L2 cache misses (for a good part write misses in the heap) and branch mispredictions (mainly from indirect branches, which are very frequent in GHC; apparently most calls and returns turn into indirect branches). They also perform a more detailed analysis of the reasons for cache misses based on simulation results. In addition, they also compare with some SML/NJ and C programs; it turns out that the GHC programs have a significantly higher CPI (typically 1.5--3) compared to the SML/NJ (1.2--1.6) and the C (1.1--1.5) programs.} } @Article{heering+90, author = {Jan Heering and Paul Klint and Jan Reekers}, title = {Incremental Generation of Parsers}, journal = {IEEE Transactions on Software Engineering}, year = {1990}, volume = {16}, number = {12}, pages = {1344--1351}, month = dec, annote = {} } @InProceedings{hack+06, author = {Sebastian Hack and Daniel Grund and Gerhard Goos}, title = {Register Allocation for Programs in {SSA}-Form}, booktitle = {Compiler Construction {CC'06}}, pages = {247--262}, year = {2006}, volume = {3923}, series = {LNCS}, publisher = {Springer}, annote = {They present a register allocation algorithm for programs in SSA form.
As long as the program is in SSA form, its interference graph is chordal and optimally colourable in polynomial time. So the register allocation algorithm works by spilling, then colouring, then coalescing, and finally getting out of SSA form. The paper first discusses why the colouring is not the problem in this approach, then discusses the other parts: spilling is done heuristically with something inspired by Belady's algorithm; SSA destruction is done by translating the phi functions into sequences of register swaps; finally, the paper talks a lot about coalescing (and colouring), such that many values stay in the same register during phi functions (so SSA destruction has less to do). For this part, the paper also presents empirical results, comparing the heuristic they developed with a version that does not do coalescing, and with a version that uses integer linear programming to find an optimal solution in most cases. Their heuristic is much closer to the optimum.} } @InProceedings{choi+99, author = {Jong-Deok Choi and David Grove and Michael Hind and Vivek Sarkar}, title = {Efficient and Precise Modeling of Exceptions for the Analysis of {Java} Programs}, booktitle = {Program Analysis for Software Tools and Engineering (PASTE'99)}, OPTpages = {}, year = {1999}, OPTnote = {}, annote = {Presents the Factored Control Flow Graph, a representation of the control flow that represents control flow edges resulting from potential exception-throwing instructions (PEIs) more efficiently (basically, it does not treat PEIs as branches for block formation, and creates only one set of exception edges per basic block instead of one per PEI). This helps in reducing the number of basic blocks and control flow edges significantly, which helps compile time and memory consumption.} } @TechReport{rodeh06, author = {Ohad Rodeh}, title = {B-Trees, Shadowing, and Clones}, institution = {IBM}, year = {2006}, type = {IBM Research Report}, number = {H-0245 (H0611-006)}, month = nov, url = {http://www.cs.huji.ac.il/~orodeh/papers/ibm-techreport/H-0245.pdf}, slides-url = {http://www.cs.huji.ac.il/~orodeh/papers/LinuxFS_Workshop.pdf}, note = {Presented at the 2007 Linux Storage and File Systems Workshop}, annote = {The main focus of this work is making B-Trees work efficiently in copy-on-write-style file systems (called ``shadowing'' in this paper). The paper discusses the problems that popular B-tree variants have in a COW environment (the whole tree might be copied on a single insert or delete operation), and suggests using B+-trees and a specific order of processing the nodes in order to alleviate these problems. However, B-Trees are designed to minimize writes in an update-in-place system. Given that in a copy-on-write system we write the whole path from the root to the changed block anyway, I wonder if there is some design that utilizes the additional writes in a useful way. In any case, what I found most interesting in this paper is that it uses reference counts for free-blocks management in the presence of clones.
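(Here is my own minimal C sketch of shadowing with refcounted blocks, an in-memory stand-in with hypothetical names, not the paper's disk layout: an update clones every node on the root-to-leaf path, bumps the refcounts of shared children, and leaves the old version intact under the old root.)

#include <stdlib.h>

typedef struct Node { struct Node *kid[4]; int refs; /* keys elided */ } Node;

/* shadow the path root -> ... -> leaf selected by path[0..depth-1] */
Node *shadowpath(Node *root, const int *path, int depth) {
  Node *c = malloc(sizeof *c);
  *c = *root;
  c->refs = 1;
  for (int i = 0; i < 4; i++)          /* the copy initially shares all children */
    if (c->kid[i]) c->kid[i]->refs++;
  if (depth > 0) {
    Node *old = c->kid[path[0]];
    c->kid[path[0]] = shadowpath(old, path + 1, depth - 1);
    if (--old->refs == 0) free(old);   /* refcounted free-block management */
  }
  return c; /* new root; the old root still names the old tree version */
}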
The paper gives performance results for synthetic workloads.} } @InProceedings{pinheiro+07, author = {Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz Andr\'e Barroso}, title = {Failure Trends in a Large Disk Drive Population}, booktitle = {5th USENIX Conference on File and Storage Technologies (FAST '07)}, OPTpages = {}, year = {2007}, month = feb, annote = {Reports data on the correlation of various hard disk properties (in particular SMART output) with their failure probability, based on $>100,000$ drives installed at Google. The paper mentions that drive model, manufacturer and vintage play a role, but does not give data on model and manufacturer. Utilization and temperature did not play a big role in failure probability (but the drives were not run at really high temperatures; few were above 45C). From the SMART data, scan errors, reallocations, offline reallocations and probational counts were significantly correlated with failure probability, whereas seek errors, calibration retries and spin retries had little significance. But on more than half of the failed drives, the four strong indicators mentioned above had no counts.} } @InProceedings{dybvig06, author = {R. Kent Dybvig}, title = {The Development of {Chez Scheme}}, booktitle = {International Conference on Functional Programming (ICFP'06)}, pages = {1--12}, year = {2006}, annote = {Gives a history of Chez Scheme and its precursors, what technical innovations and features went into which version, and who contributed.} } @InProceedings{fisher&shivers06, author = {David Fisher and Olin Shivers}, title = {Static Analysis for Syntax Objects}, booktitle = {International Conference on Functional Programming (ICFP'06)}, pages = {111--121}, year = {2006}, annote = {Presents an s-expression-based system for providing macros to arbitrary languages, including support for static analysis (e.g., type inference).} } @InProceedings{saabas&uustalu07, author = {Ando Saabas and Tarmo Uustalu}, title = {Type Systems for Optimizing Stack-Based Code}, booktitle = {ByteCode 2007 (ETAPS '07 workshop)}, year = {2007}, annote = {Performs a number of simple optimizations on JVM code: dead-code elimination, load-pop elimination, store-load optimization. The interesting parts are that their optimizer is based on a type checker, and that the optimizations are performed across basic blocks (on unstructured code).} } @Article{benton+04, author = {Nick Benton and Luca Cardelli and C\'{e}dric Fournet}, title = {Modern Concurrency Abstractions for {C\#}}, journal = toplas, year = {2004}, volume = {26}, number = {5}, pages = {769--804}, month = sep, annote = {Polyphonic C\# extends C\# with mechanisms for communication and synchronization between threads: asynchronous methods and chords. Asynchronous methods can be called with a non-blocking call (they are always void). A chord is a method body that has several method heads (at most one synchronous); the body is only executed once all of the heads are called; several chords can contain the same head; selection among several matching chords is nondeterministic, as is the order among several possible calls to a method. The theoretical framework behind Polyphonic C\# is the join calculus, but the paper does not delve into that. Instead, it shows with program examples how the new features can be used for various parallel/distributed programming problems.
It also discusses how Polyphonic C\# is translated into ordinary C\#, and gives some performance results, mainly showing the cost of various operations in microbenchmarks.} } @InProceedings{russel+06, author = {Francis P. Russell and Michael R. Mellor and Paul H. J. Kelly and Olav Beckmann}, title = {An Active Linear Algebra Library Using Delayed Evaluation and Runtime Code Generation [Extended Abstract]}, booktitle = {Library-Centric Software Design (LCSD'06)}, pages = {5--13}, year = {2006}, url = {http://www.doc.ic.ac.uk/~phjk/Publications/DelayedEvaluationRTCG-LCSD06-ExtendedAbstract.pdf}, proceedingsurl = {http://sms.cs.chalmers.se/bibliography/proceedings/2006-LCSD.pdf}, annote = {This work takes a description of a linear algebra computation, applies some optimizations, generates C++ (?) source code for it at run-time, then compiles, links and executes that code. The optimizations applied before generating source code are loop fusion, array contraction, and liveness analysis (however, the latter is not clear to me; apparently liveness is a statistical property and they use some learning approach to determine it, and they fall back on regenerating the value if they predicted wrongly that it is dead). On the run-time code generation part they perform a caching optimization, but only within a run (caching across runs is mentioned as future work); they give a rather detailed, yet confusing description of the hashing and isomorphism checking they use in caching. The paper presents performance results, showing the effects of their optimizations (which do not help in most cases), and also comparing with the Matrix Template Library; the speedups depend on the benchmark and the specific CPU (they used two Pentium~4 variants, and still got significant performance differences).} } @InProceedings{zhang+07, author = {David Zhang and Qiuyuan J. Li and Rodric Rabbah and Saman Amarasinghe}, title = {A Lightweight Streaming Layer for Multi-Core Execution}, booktitle = {2007 Workshop on Design, Architecture and Simulation of Chip Multi-Processors}, OPTpages = {}, year = {2007}, url = {http://cag.lcs.mit.edu/commit/papers/07/zhang-dascmp07.pdf}, OPTannote = {Describes a low-level library for implementing streaming computations on the Cell platform, and provides some empirical data. The library breaks the program into tasks, which are then scheduled by a scheduler. The tasks are not filters over the entire data, but filters for a specific buffer size; the same filter can run on different data on different cores at the same time if there is data parallelism. The restrictions of Cell lead to some unusual design decisions: In particular, the filters have to be loaded dynamically into the SPEs, and they then have to be used for a long time in order to amortize the loading cost. Probably because of that, they choose a big buffer size of 1MB (apparently some overheads are too large for buffer sizes that fit into SPE's local memory), so they have to put the data in main memory (supposedly this is not a bottleneck for them, but then they don't achieve top performance in other areas). There is an interesting discussion and empirical comparison of static and dynamic scheduling; it seems that except under special circumstances, dynamic scheduling works better. The paper discusses a lot of implementation stuff, but I still get the feeling that I am missing a lot. However, quite interesting overall, even though I think that the design and implementation is quite Cell-specific.
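(The chunk-at-a-time filter model generalizes beyond Cell; here is my own generic double-buffering sketch in C, with assumed fetch/put callbacks, not the library's API: while the filter computes on one buffer, the transfer of the next chunk can already be in flight if the callbacks are asynchronous.)

enum { N = 1024 }; /* chunk size in elements; the paper uses 1MB buffers */
typedef void (*Fetch)(float *dst, int chunk);     /* load one input chunk */
typedef void (*Put)(const float *src, int chunk); /* write back one result chunk */
typedef void (*Filter)(float *buf, int n);        /* in-place kernel */

void stream(Filter f, Fetch fetch, Put put, int nchunks) {
  float a[N], b[N];
  float *cur = a, *nxt = b;
  if (nchunks > 0) fetch(cur, 0);
  for (int i = 0; i < nchunks; i++) {
    if (i + 1 < nchunks) fetch(nxt, i + 1); /* prefetch the next chunk */
    f(cur, N);
    put(cur, i);
    float *t = cur; cur = nxt; nxt = t; /* swap buffers */
  }
}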
There is also an interesting Related Work section.} } @InProceedings{gummaraju&rosenblum05, author = {Jayanth Gummaraju and Mendel Rosenblum}, title = {Stream Programming on General-Purpose Processors}, booktitle = {38th Annual International Symposium on Microarchitecture (MICRO-38)}, OPTpages = {}, year = {2005}, url = {http://merrimac.stanford.edu/publications/micro38_streamingGPP.html}, annote = {The authors implement a stream programming system on a Pentium~4 with Hyperthreading and empirically compare some applications with conventionally programmed (single-threaded) variants. The applications they use are of the usual scientific type that stream programming is used for. The implementation of stream programming is remarkably similar to that on distributed-memory machines like the Cell: Data is processed by breaking the work into tasks that work on a single buffer (called strip-mining in the paper), using double buffering to allow several tasks to work in parallel, and having a scheduler that determines which task runs on which hardware thread. The buffers (called stream register files (SRFs)) are sized so they all fit into the L2 cache (1MB), with 1--2 ways per set left free for main memory accesses. The programming model seems to divide the work mainly between pure memory access (in particular scatter/gather, one such thread per memory stream), and pure computation (which can perform many computations in one kernel); the scheduler tries to schedule a memory task on one thread and a compute task on the other, because that results in better performance, whereas having two memory tasks in the two threads gives worse performance than executing the tasks sequentially. The paper presents microbenchmarks that show this, and presents application benchmarks for streaming and conventional programs that show that the streaming program can be faster (often by a factor of 1.2) than the conventional program, but also slower (more than a factor of 2 slowdown for one problem size of one benchmark). The explanation for the speedup is that the memory system (e.g., hardware prefetching) works better if it has to work on just one memory stream instead of several intermixed ones as in the conventional code.} } @Book{oram&wilson07, editor = {Andy Oram and Greg Wilson}, title = {Beautiful Code}, publisher = {O'Reilly}, year = {2007}, annote = {This book contains 33 chapters, each by different authors, most discussing a program written by the author of the chapter. Some of the chapters are excellent (e.g., the one by Jon Bentley or the one on MapReduce), some are at a much lower level (and one of those (by Tim Bray) makes claims and recommendations that are misleading). Most of the chapters, however, discuss a large piece of software, which is a hard task. These chapters often required quite a bit of prior knowledge of the problem solved and of the method used for solving it; not surprisingly, I found that I skipped much of these chapters, because I found them uninteresting, and/or because I found them too hard to follow.
Many of the chapters discuss performance improvements; while I have a soft spot for this topic, I found it overrepresented given the title of the book; but maybe this just means that many programmers find fast code beautiful.} } @Book{valloud08, author = {Andr\'es Valloud}, title = {Hashing in Smalltalk: Theory and Practice}, publisher = {self-published (www.lulu.com)}, year = {2008}, annote = {This book mainly discusses non-cryptographic hash functions, with a bit of background on hashing and hash function testing. The main body of the book is taken up by a discussion of various hash functions found in the literature (chapter 5) and found in Smalltalk implementations (chapter 7). The book has a number of shortcomings: It lacks a literature list with proper references. It is written in a verbose, repetitive, and not very systematic way: in particular, many hash functions are discussed in arbitrary order, instead of categorizing them at a somewhat finer level, and discussing all of the hash functions of one category together; many similar hash functions are discussed at length. To make things worse, there is no executive summary to be found (not even in the conclusions section, which just introduces quality categories, but fails to categorize the hash functions discussed until then). There is also no index (which would help alleviate some of the disadvantages of the verbose style). Each hash function is described with a nice table, but I did not find a definition of the "Hash quality" metric, and the collisions etc. when working on the raw hash function seem to be of little interest. Hash function values modulo $p$ are only evaluated for prime $p$s, but power-of-two $p$s would be more interesting (the author summarily ignores that option because he claims that it leads to too many collisions); it also does not evaluate hash functions w.r.t. all the criteria discussed earlier in the book, e.g., it does not perform avalanche tests. Hash function code is presented in Smalltalk, which makes it more verbose for this purpose than, say, C, and also harder to understand for many readers. Despite all these shortcomings, the book can still be useful for someone who wants an overview and some test results for a bunch of hash functions.} } @InProceedings{agakov+06, author = {F. Agakov and E. Bonilla and J. Cavazos and B. Franke and G. Fursin and M. F. P. O'Boyle and J. Thompson and M. Toussaint and C.K.I. Williams}, title = {Using Machine Learning to Focus Iterative Optimization}, booktitle = {Code Generation and Optimization (CGO'06)}, pages = {295--305}, year = {2006}, url = {http://homepages.inf.ed.ac.uk/jcavazos/cgo-2006.pdf}, annote = {Optimization can be improved by trying out different orderings of optimization passes, and selecting the one that gives the best result; however, this approach results in long compile times. This paper attacks this problem by doing off-line training to find out what works well for the particular target architecture, and then needs much less compile time to achieve results of similar quality. The paper uses machine learning techniques for this.} } @InProceedings{parker+92, author = {D.
Scott Parker and Eric Simon and Patrick Valduriez}, title = {SVP -- a Model Capturing Sets, Streams, and Parallelism}, booktitle = {Very Large Data Bases (VLDB'92)}, pages = {115--126}, year = {1992}, annote = {This paper presents some classical data and computation structures (e.g., collections, and divide-and-conquer) in a functional-programming way and discusses how programs organized in that way can be parallelized.} } @InProceedings{koes&goldstein08, author = {David Ryan Koes and Seth Copen Goldstein}, title = {Near-Optimal Instruction Selection on {DAGs}}, booktitle = {Code Generation and Optimization (CGO '08)}, pages = {45--54}, year = {2008}, url = {http://www.cs.cmu.edu/~dkoes/research/dkoes_cgo08.pdf}, annote = {The instruction selection variant dealt with in the paper corresponds to non-normalized tree grammars with only one kind of non-terminal (this is called \emph{tiling} in the paper). The paper provides a proof that this restricted form of tree parsing is still NP-complete for DAGs. The paper also presents NOLTIS, a heuristic improvement over straightforward DAG parsing, and evaluates it; unfortunately NOLTIS is restricted to the restricted problem discussed above. The evaluation is the most interesting part of the paper: They implement various DAG instruction selection algorithms in LLVM and compare NOLTIS to the original (greedy) LLVM algorithm, plain DAG parsing with two different ways of decomposing DAGs into trees, or without decomposing them into trees, and an optimal algorithm based on integer programming; the comparison metrics are parsing cost, resulting code size (after register allocation etc.), and instruction selection speed. NOLTIS is very close to optimal, but takes twice as long as the plain DAG-parsing variants, and the greedy LLVM algorithm is even faster. Among the plain DAG-parsing variants, the one that does not decompose into trees produces the best code, whereas the one that decomposes completely performs worst, even worse than the greedy LLVM algorithm.} } @InProceedings{nethercote&seward07, author = {Nicholas Nethercote and Julian Seward}, title = {Valgrind: A Framework for Heavyweight Dynamic Binary Translation}, booktitle = {Programming Language Design and Implementation (PLDI'07)}, pages = {89--100}, year = {2007}, annote = {Gives a nice overview of the Valgrind framework for binary instrumentation, explains the motivation, requirements, implementation techniques, compares with competing systems and presents data for evaluating various aspects. Valgrind works like a binary translator, disassembling the original program into an IR, then (unlike a translator) the tool-specific part inserts code in the IR, and then the IR is compiled back into machine code. This approach is less efficient than some others, but allows more powerful tools (thus the \emph{heavyweight} part of the title).} } @InCollection{stroustroup01, author = {Bjarne Stroustrup}, title = {Exception safety: concepts and techniques}, booktitle = {Advances in exception handling techniques}, pages = {60--76}, publisher = {Springer LNCS~2022}, year = 2001, url = {http://www.research.att.com/~bs/except.pdf}, OPTannote = {} } @InProceedings{reddi+07, author = {Vijay Janapa Reddi and Dan Connors and Robert Cohn and Michael D.
Smith}, title = {Persistent Code Caching: Exploiting Code Reuse Across Executions and Applications}, booktitle = {Code Generation and Optimization (CGO '07)}, pages = {74--88}, year = {2007}, url = {http://rogue.colorado.edu/draco/papers/cgo-07-persistence.pdf}, annote = {Describes and empirically evaluates persistent code caching as used in the Pin dynamic instrumentation tool. The evaluation starts with same-input caching across executions, and then proceeds to different-input caching, and across-application caching; unsurprisingly, these techniques help; more interesting is that their effectiveness varies quite a lot between applications, and for varying reasons. The design apparently can only deal with one cache per run, leading to some consequences and evaluations that would probably not be done in a system that can support more liberal mixing of persistently cached code. One interesting observation is that the meta-data is bigger than the code.} } @InProceedings{bruening&kiriansky08, author = {Derek Bruening and Vladimir Kiriansky}, title = {Process-Shared and Persistent Code Caches}, booktitle = {Virtual Execution Environments (VEE'08)}, pages = {61--70}, year = {2008}, url = {http://www.burningcutlery.com/derek/docs/procshared-VEE08.pdf}, annote = {An in-depth discussion of various design issues for persistent and shared code caches, with particular concern for security issues. The context is binary instrumentation based on DynamoRIO. They produce one cache file for each \emph{module} (shared object file). The code (and much of the data) in these files is read-only and thus shared between processes using the cache; the rest of the data is privately writable. For security, a user uses only caches created by himself or by root (there is some strangeness about the directories in Section 3.2). Most of the design decisions are pretty straightforward, but there are a lot of issues discussed in the paper. The evaluation shows the memory savings due to sharing, and the startup speed advantage due to persistence, both of which are substantial. The paper contains a long \emph{Related Work} section and cites a lot of papers.} } @Article{brauer+08, author = {Johannes Brauer and Christoph Crasemann and Hartmut Krasemann}, title = {Auf dem Weg zu idealen Programmierwerkzeugen -- Bestandsaufnahme und Ausblick}, journal = {Informatik-Spektrum}, year = {2008}, pages = {580--590}, volume = {31}, number = {6}, annote = {A critique of existing programming languages and tools, and suggestions for future programming languages. The content appears more relevant than most other papers of its kind \cite{wirth06}, but it is also much less concrete (especially in the critique part). As for the suggestions, they envision a dynamic language with some (unexplained) provisions for supporting stable and safe end products; they also envision a Smalltalk-style really integrated development environment (instead of the add-on style of Eclipse); and they envision that the language/tool supports a large number of DSLs in order to bridge the semantic gap.} } @InProceedings{prokopski&verbrugge08, author = {Gregory B. Prokopski and Clark Verbrugge}, title = {Compiler-Guaranteed Safety in Code-Copying Virtual Machines}, booktitle = {Compiler Construction (CC'08)}, pages = {163--177}, year = {2008}, publisher = {Springer LNCS 4959}, OPTannote = {} } @InProceedings{otto+09europar, author = {Frank Otto and Victor Pankratius and Walter F.
Tichy}, title = {{XJava}: Exploiting Parallelism with Object-Oriented Stream Programming}, booktitle = {Euro-Par '09}, pages = {875--886}, year = {2009}, publisher = {Springer LNCS~5704}, OPTannote = {} } @InProceedings{schroeder+09, author = {Bianca Schroeder and Eduardo Pinheiro and Wolf-Dietrich Weber}, title = {DRAM Errors in the Wild: A Large-Scale Field Study}, booktitle = {SIGMETRICS/Performance '09}, OPTpages = {}, year = {2009}, url = {http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf}, annote = {Presents data on correctable and uncorrectable errors on a large population of servers with ECC at Google (data from 2.5 years). There are lots of interesting results, but the most significant ones are that a surprisingly large amount of errors occur (especially correctable ones), and that many DIMMs don't have any error, while some have lots of errors (and more over time unless they are replaced).} } @InProceedings{salverda&zilles07, author = {Pierre Salverda and Craig Zilles}, title = {Dependence-Based Scheduling Revisited: A Tale of Two Baselines}, booktitle = {Sixth Annual Workshop on Duplicating, Deconstructing, and Debunking (WDDD 2007)}, year = {2007}, url = {http://www.ece.wisc.edu/~wddd/2007/papers/wddd_01.pdf}, url2 = {http://www-sal.cs.uiuc.edu/~zilles/papers/lanes.wddd-2007.pdf}, url3 = {https://pharm.ece.wisc.edu/wddd/2007/papers/wddd_01.pdf}, annote = {When the authors simulated the dependence-based scheduling work by Palacharla, Kim, and Smith, they found 30\% lower IPC than a conventional OoO machine, whereas the original simulations only found a 5\% lower IPC. The paper analyses the reasons for this, and provides a number of insights into how hardware schedulers, execution engines, and various features in them interact, and why and how dependence-based scheduling works. The authors' simulation had a number of significant differences from the simulation in the original work: it used a memory disambiguator, 2-cycle load latency (instead of 1-cycle), and a better branch predictor. These changes increase the number of strands available at the same time, and the 8-lane dependence-based machine becomes lane-limited (and instruction fetch stalls waiting for a free lane), so it cannot profit from the improvements or work around the higher latency, whereas a conventional OoO machine can. 24 lanes would be required to bring the IPC disadvantage of the dependence-based machine down to 5\% on the authors' simulator. OTOH, by changing these parts of their simulation to be like the original work, the dependence-based scheduling only had an 11\% IPC disadvantage on an 8-lane machine (much closer to the original 5\%).} } @InProceedings{tseng&patt08, author = {Francis Tseng and Yale N. Patt}, title = {Achieving Out-of-Order Performance with Almost In-Order Complexity}, crossref = {isca08}, pages = {3--12}, url = {http://users.ece.utexas.edu/~tsengf/files/braids08.pdf}, annote = {Divides basic blocks into \emph{braids}, which do not have data-flow dependencies on each other. The program is then encoded into these braids: The start of each braid is marked, and register accesses are separated into internal and external registers (where internal registers only live during the braid and are only accessed from within the braid); results can be encoded to go into the internal registers, external registers, or both.
The compiler performs this encoding (and has to observe memory dependences), and the front end then just drops each braid into a \emph{braid execution unit} (BEU), which has a two-wide in-order scheduler. A BEU waits for external inputs with a simple busy-bit-vector. A checkpoint mechanism is used to recover from branch mispredictions; only external registers need to be checkpointed; the same mechanism plus reexecution is used to recover from speculation across exceptions. The example architecture has 8~BEUs and an 8-entry external register file, and each BEU has an 8-entry internal register file. When compared to an 8-issue out-of-order machine, the loss of IPC due to the lower scheduling flexibility is 9\% on average. The paper also gives various data on characteristics of braids in their benchmarks (although that presumably depends quite a bit on the compiler's basic-block formation) and also data on various variations of their braided architecture.} } @Proceedings{isca08, title = "$35^\textit{th}$ Annual International Symposium on Computer Architecture", booktitle = "$35^\textit{th}$ Annual International Symposium on Computer Architecture", year = "2008", key = "ISCA 35", } @Book{liang99, author = {Sheng Liang}, title = {{Java Native Interface}: Programmer's Guide and Specification}, publisher = {Addison-Wesley}, year = {1999} } @Manual{dumse, title = {User Manual Max-FORTH}, OPTkey = {}, author = {Randy M. Dumse}, organization = {New Micros}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTyear = {}, url = {{\url{http://www.newmicros.com/download/appnotes/hc11_FORTH/ummax/MASTER5.TXT}}}, OPTannote = {} } @Article{vandierendonck&bosschere05, author = {Hans Vandierendonck and Koen De Bosschere}, title = {XOR-Based Hash Functions}, journal = ieeetc, year = {2005}, volume = {54}, number = {7}, pages = {800--812}, month = jul, annote = {In XOR-based hash functions every bit of the hash function is computed as the xor of several bits in the (fixed-width) input. This paper presents two theoretical methods for reasoning about XOR-based hash functions: The null space representation gives an idea of which hash functions are equivalent, and what patterns conflict; and the column space, which can be used to reduce fan-in (the focus of the paper is on using these hash functions in various hardware applications, such as caches, branch predictors, and TLBs). The paper also discusses applications of these methods. The paper expects that the reader is familiar with linear algebra (more familiar than I was when I read it).} } @Article{ducournau11, author = {Roland Ducournau}, title = {Implementing Statically Typed Object-Oriented Programming Languages}, journal = {ACM Computing Surveys}, year = {2011}, volume = {43}, number = {3}, pages = {Article 18}, month = apr, annote = {A survey of implementation techniques for object-oriented languages, in particular for method dispatch, attribute (instance variable) access, and subtype testing. There is a focus on languages like Java, i.e., on incremental techniques and static types, but the survey also discusses techniques for dynamic languages and non-incremental techniques. The coverage and insights of this paper appear quite good; the downside is that it suffers from an overly abstract presentation that is probably hard to understand for readers who are not already familiar with the discussed techniques.} } @article{gil+08, author = {Joseph (Yossi) Gil and William Pugh and Grant E.
@article{gil+08, author = {Joseph (Yossi) Gil and William Pugh and Grant E. Weddell and Yoav Zibin}, title = {Two-dimensional bidirectional object layout}, journal = {ACM Trans. Program. Lang. Syst.}, volume = {30}, number = {5}, year = {2008}, issn = {0164-0925}, pages = {1--38}, doi = {http://doi.acm.org/10.1145/1387673.1387677}, publisher = {ACM}, address = {New York, NY, USA}, }
@Article{amiel+94, author = "E. Amiel and O. Gruber and E. Simon", title = "Optimizing Multi-Method Dispatch Using Compressed Dispatch Tables", journal = "ACM SIG{\-}PLAN Notices", volume = "29", number = "10", pages = "244--244", month = oct, year = "1994", coden = "SINODQ", ISSN = "0362-1340", bibdate = "Fri Apr 24 18:36:02 MDT 1998", acknowledgement = ack-nhfb, classification = "C4240 (Programming and algorithm theory); C6110J (Object-oriented programming); C6140D (High level languages)", conflocation = "Portland, OR, USA; 23-27 Oct. 1994", conftitle = "Ninth Annual Conference on Object-Oriented Programming Systems, Languages, and Applications. OOPSLA '94", corpsource = "INRIA, Le Chesnay, France", keywords = "C++; coloring; compressed dispatch tables; constant time performance; mono-methods; multi-method dispatch; object oriented programming; object-oriented language; object-oriented languages; object-oriented programming; optimisation; optimization; programming theory; signature analysis", sponsororg = "ACM", treatment = "P Practical; T Theoretical or Mathematical", }
@InProceedings{hoelzle&ungar94, author = "Urs Hoelzle and David Ungar", booktitle = "ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI '94)", title = "Optimizing Dynamically-Dispatched Calls with Run-Time Type Feedback", year = "1994", abstract-url = "http://http.cs.ucsb.edu/~urs/type-feedback.html", address = "Orlando, FL", url = "ftp://self.stanford.edu/pub/papers/pldi94.ps.Z", keywords = "Object-oriented, SELF", month = jun, scope = "opt", }
@Manual{intel12, title = {Intel 64 and IA-32 Architectures Optimization Reference Manual}, OPTkey = {}, OPTauthor = {}, organization = {Intel}, OPTaddress = {}, OPTedition = {}, month = apr, year = {2012}, note = {Order number 248966-026}, url = {http://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf}, OPTannote = {} }
@Article{boute92, author = {Raymond T. Boute}, title = {The {Euclidian} Definition of the Functions div and mod}, journal = toplas, year = {1992}, volume = {14}, number = {2}, pages = {127--144}, month = apr, url = {https://biblio.ugent.be/input/download?func=downloadFile&recordOId=314490&fileOId=452146}, OPTannote = {Discusses various variants of defining div and mod, and advocates a variant he calls Euclidian, where the remainder is never negative, irrespective of the signs of divisor and dividend. The paper shows a few examples of the supposed usefulness, but in my eyes they are curiosities that one is unlikely to encounter in practice; in particular, there is a long passage about number representations, where Euclidian division gives an advantage if we combine negative base, positive digits, and base-complement representation; this discussion is too much on the mathematical side for my taste. Concerning implementation, the author writes that a sign check on the divisor is required to handle the difference between Euclidian and floored division.} }
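To make the boute92 entry above concrete, a small C sketch (mine) of the Euclidian definition: the remainder is always non-negative, whereas C's / and % truncate towards zero. Note the sign check on the divisor mentioned in the annotation.

  /* Euclidian division: 0 <= rem < |d|, and n == quot*d + rem */
  typedef struct { long quot, rem; } ediv_t;

  ediv_t ediv(long n, long d)
  {
      ediv_t r = { n / d, n % d };       /* C truncates towards zero */
      if (r.rem < 0) {                   /* fix up a negative remainder */
          if (d > 0) { r.quot -= 1; r.rem += d; }
          else       { r.quot += 1; r.rem -= d; }
      }
      return r;
  }

Example: ediv(-7, 2) yields quot = -4, rem = 1, whereas C computes -7/2 = -3 and -7%2 = -1; floored division agrees with the Euclidian result for positive divisors and differs for negative ones.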
@TechReport{adve&gharachorloo95, author = {Sarita V. Adve and Kourosh Gharachorloo}, title = {Shared Memory Consistency Models: A Tutorial}, institution = {Digital Western Research Lab}, year = {1995}, type = {WRL Research Report}, number = {95/7}, annote = {Gives an overview of architectural features of shared-memory computers such as independent memory banks and per-CPU caches, and how they make the consistency model that is most natural for programmers hard to implement, giving examples of programs that can fail with weaker consistency models. It then discusses several categories of weaker consistency models and actual consistency models in these categories, and which ``safety net'' (e.g., memory barrier instructions) programmers need to use to work around the deficiencies of these models. While the authors recognize that programmers find it difficult to use these safety nets correctly and efficiently, the paper still advocates weaker consistency models, claiming that sequential consistency is too inefficient, by outlining an inefficient implementation (which is of course no proof that no efficient implementation exists). Still, the paper is a good introduction to the issues involved.} }
@InProceedings{daya+14, author = {Bhavya K. Daya and Chia-Hsin Owen Chen and Suvinay Subramanian and Woo-Cheol Kwon and Sunghyun Park and Tushar Krishna and Jim Holt and Anantha P. Chandrakasan and Li-Shiuan Peh}, title = {{SCORPIO}: A 36-Core Research-Chip Demonstrating Snoopy Coherence on a Scalable Mesh {NoC} with In-Network Ordering}, crossref = {isca14}, OPTpages = {}, url = {http://projects.csail.mit.edu/wiki/pub/LSPgroup/PublicationList/scorpio_isca2014.pdf}, annote = {The cores on the chip described in this paper access their shared memory in a sequentially consistent manner; what's more, the chip provides a significant speedup in comparison to the distributed directory and HyperTransport coherence protocols. The main idea is to deal with the ordering separately from the data, in a distributed way. The ordering messages are relatively small (one bit per core). For details see the paper.} }
@Proceedings{isca14, title = "$41^\textit{st}$ Annual International Symposium on Computer Architecture", booktitle = "$41^\textit{st}$ Annual International Symposium on Computer Architecture", year = "2014", key = "ISCA 2014", }
@Article{jones06, author = {Derek M. Jones}, title = {Developer Beliefs about Binary Operator Precedence}, journal = {C Vu}, year = {2006}, volume = {18}, number = {4}, pages = {14--21}, month = aug, url = {http://www.knosof.co.uk/cbook/accu06a.pdf}, OPTannote = {The author performed an experiment where he gave test subjects a number of expressions containing two random operators (from the C family) without parentheses, and the subjects had to insert parentheses such that the meaning would not change. The number of mistakes was surprisingly large (66\% correct; 50\% would be the result of guessing), and the paper presents some more detailed statistical analyses of the results, as well as some statistics about the occurrence of operators in real-world code.} }
@InProceedings{dietz+12, author = {Will Dietz and Peng Li and John Regehr and Vikram Adve}, title = {Understanding Integer Overflow in {C}/{C++}}, booktitle = {34th International Conference on Software Engineering (ICSE)}, OPTpages = {}, year = {2012}, url = {http://www.cs.utah.edu/~regehr/papers/overflow12.pdf}, OPTannote = {} }
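To make concrete what dietz+12 is about, a minimal example (mine, not from the paper): signed overflow is undefined behavior in C, so a check has to be written to avoid the overflow rather than detect it afterwards.

  #include <limits.h>

  /* returns 1 and leaves *sum alone if a+b would overflow, else 0 */
  int add_overflows(int a, int b, int *sum)
  {
      if ((b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b))
          return 1;               /* a+b would be undefined behavior */
      *sum = a + b;
      return 0;
  }

The naive post-hoc check if (a+b < a) is exactly the kind of idiom that breaks when the compiler exploits the undefinedness.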
@InProceedings{wang+12, author = {Xi Wang and Haogang Chen and Alvin Cheung and Zhihao Jia and Nickolai Zeldovich and M. Frans Kaashoek}, title = {Undefined Behavior: What Happened to My Code?}, booktitle = {Asia-Pacific Workshop on Systems (APSYS'12)}, OPTpages = {}, year = {2012}, url1 = {http://homes.cs.washington.edu/~akcheung/getFile.php?file=apsys12.pdf}, url2 = {http://people.csail.mit.edu/nickolai/papers/wang-undef-2012-08-21.pdf}, OPTannote = {} }
@Manual{c99rationale, title = {Rationale for International Standard---Programming Languages---C}, key = {C}, OPTauthor = {}, OPTorganization = {}, OPTaddress = {}, edition = {Revision 5.10}, OPTmonth = {}, year = {2003}, url = {http://www.open-std.org/jtc1/sc22/wg14/www/C99RationaleV5.10.pdf}, OPTannote = {} }
@InProceedings{rohou+15, author = {Erven Rohou and Bharath Narasimha Swamy and Andr\'e Seznec}, title = {Branch Prediction and the Performance of Interpreters --- Don't Trust Folklore}, booktitle = {Code Generation and Optimization (CGO)}, OPTpages = {}, year = {2015}, url = {https://hal.inria.fr/hal-01100647/document}, annote = {Evaluates the indirect branch predictors of Nehalem, Sandy Bridge, and Haswell and the ITTAGE predictor on the interpreters of Python, Spidermonkey (JavaScript), and a (.NET) CLI interpreter running a number of benchmarks. Haswell and ITTAGE are very good at branch prediction for these benchmarks, and they suggest that switch-based interpreters are good enough because of that. However, my own results \url{news:<2015Sep7.142507@mips.complang.tuwien.ac.at>} show that shared dispatch branches can still produce a significant slowdown. (A toy sketch contrasting the two dispatch styles follows after knuth64 below.)} }
@InProceedings{holladay+07, author = {Kenneth Holladay and Kay Robbins and Jeffery von Ronne}, title = {{FIFTH\textsuperscript{TM}}: A Stack Based {GP} Language for Vector Processing}, booktitle = {Genetic Programming (EuroGP)}, year = {2007}, editor = {Marc Ebner et~al.}, pages = {102--113}, publisher = {Springer LNCS~4445}, OPTannote = {} }
@Manual{forth2012, title = {Forth Standard 2012}, organization = {Forth~200x Standardization Committee}, year = {2014}, url = {http://www.forth200x.org/documents/forth-2012.pdf} }
@TechReport{ivanov+18, author = {Vladimir Ivanov and Razvan Lupusoru and Paul Sandoz and Sandhya Viswanathan}, title = {{JEP} draft: Vector {API}}, institution = {OpenJDK}, year = {2018}, url = {http://openjdk.java.net/jeps/8201271}, OPTannote = {} }
@InProceedings{gal+06, author = {Andreas Gal and Christian W. Probst and Michael Franz}, title = {{HotpathVM}: An Effective {JIT} Compiler for Resource-Constrained Devices}, booktitle = {Virtual Execution Environments (VEE'06)}, year = {2006}, pages = {144--153}, url = {http://static.usenix.org/events/vee06/full_papers/p144-gal.pdf}, OPTannote = {} }
@Book{wolfe96, author = {Michael Wolfe}, title = {High-Performance Compilers for Parallel Computing}, publisher = {Addison-Wesley}, year = {1996}, OPTannote = {} }
@Article{allen&kennedy87, author = {Randy J. Allen and Ken Kennedy}, title = {Automatic Translation of {Fortran} Programs to Vector Form}, journal = toplas, year = {1987}, volume = {9}, number = {4}, pages = {491--542}, OPTmonth = oct, OPTannote = {} }
@Article{knuth64, author = {Donald Knuth}, title = {Man or Boy?}, journal = {Algol Bulletin}, year = {1964}, pages = {7}, month = jul, url = {http://www.chilton-computing.org.uk/acl/applications/algol/p006.htm}, OPTannote = {} }
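The dispatch sketch promised in the rohou+15 annotation above (a toy example of mine, not from the paper): a switch-based interpreter funnels all dispatch through one shared indirect branch, while threaded code (here using GCC's labels-as-values extension) gives every virtual-machine instruction its own dispatch branch that the prediction hardware can track separately.

  enum op { OP_INC, OP_DEC, OP_HALT };

  long run_switch(const unsigned char *ip)
  {
      long acc = 0;
      for (;;) {
          switch (*ip++) {              /* one shared dispatch branch */
          case OP_INC:  acc++; break;
          case OP_DEC:  acc--; break;
          case OP_HALT: return acc;
          }
      }
  }

  long run_threaded(const unsigned char *ip)    /* GCC extension */
  {
      static void *tab[] = { &&inc, &&dec, &&halt };
      long acc = 0;
      goto *tab[*ip++];
  inc:  acc++; goto *tab[*ip++];        /* per-instruction dispatch */
  dec:  acc--; goto *tab[*ip++];
  halt: return acc;
  }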
LeBlanc", title = "Crafting a Compiler", publisher = "Benjamin/Cummings", year = "1988", address = "Menlo Park, CA", } @InProceedings{vonthun01, author = {Manfred von Thun}, title = {Joy: Forth's Functional Cousin}, booktitle = {EuroForth 2001 Conference Proceedings}, OPTpages = {}, year = 2001, url = {http://www.complang.tuwien.ac.at/anton/euroforth/ef01/thomas01a.pdf} } @InProceedings{pestov+10, title = "Factor: a dynamic stack-based programming language", author = "Sviatoslav Pestov and Daniel Ehrenberg and Joe Groff", booktitle = "Proceedings of the 6th Symposium on Dynamic Languages, DLS 2010, October 18, 2010, Reno, Nevada, USA", publisher = "ACM", year = "2010", editor = "William D. Clinger", OPTISBN = "978-1-4503-0405-4", pages = "43--58", OPTURL = "http://doi.acm.org/10.1145/1869631.1869637", } @Manual{cray77, title = {Cray-1 Computer System --- Hardware Reference Manual}, organization = {Cray Research, Inc.}, year = {1977}, url = {http://ed-thelen.org/comp-hist/CRAY-1-HardRefMan/CRAY-1-HRM.html}, OPTannote = {} } @InProceedings{keep+12, title = "Optimizing Closures in {O}(0) Time", author = "Andrew W. Keep and Alex Hearn and R. Kent Dybvig", bibdate = "2015-05-09", bibsource = "DBLP, http://dblp.uni-trier.de/db/conf/icfp/scheme2012.html#KeepHD12", booktitle = "Proceedings of the 2012 Annual Workshop on Scheme and Functional Programming, Scheme 2012, Copenhagen, Denmark, September 9-15, 2012", publisher = "ACM", year = "2012", xbooktitle = "Scheme@ICFP", editor = "Olivier Danvy", ISBN = "978-1-4503-1895-2", pages = "30--35", URL = "http://doi.acm.org/10.1145/2661103", urlwithoutbibliography = "https://www.cs.indiana.edu/~dyb/pubs/closureopt.pdf", OPTannote = "Describes a number of optimizations for a flat-closure implementation of Scheme." } @TechReport{cardelli83, author = "L. Cardelli", title = "The Functional Abstract Machine", institution = "AT\&T Bell Labs", number = "TR-107", year = "1983", keywords = "FP, functional programming, SML, FAM, interpreter, implementation", url = "https://karczmarczuk.users.greyc.fr/matrs/Maitrise/fam.pdf", OPTannote = "A long (45 pages) and detailed description of an abstract machine. Supposedly \cite{keep+12} this paper introduced the notion of flat closures, but if so, it does it in a clandestine way." } @PhdThesis{dybvig87, author = "R. Kent Dybvig", school = "University of North Carolina at Chapel Hill", title = "Three Implementation Models for Scheme", year = "1987", url = "http://agl.cs.unm.edu/~williams/cs491/three-imp.pdf", brokenURL = "ftp://ftp.cs.indiana.edu/pub/scheme-repository/txt/3imp.ps.Z", month = apr, OPTannote = "Introduces flat closures under the name \emph{display closures} in Section 4.4.2" } @InProceedings{lynas&stoddart06, author = {Angel Robert Lynas and Bill Stoddart}, title = {Adding {Lambda} Expressions to {Forth}}, booktitle = {22nd EuroForth Conference}, year = {2006}, pages = {27--39}, url = {http://www.complang.tuwien.ac.at/anton/euroforth2006/papers/lynas.pdf}, OPTnote = {not refereed}, abstract = {We examine the addition of Lambda expressions to Forth. We briefly review the Lambda calculus and introduce a postfix version of Lambda notation to guide our approach to a Forth implementation. The resulting implementation provides the basic facilities of an early binding functional language, allowing the treatment of functions as first-class objects, manipulation of anonymous functions, and closures.} } @ARTICLE{canella+18, author = {{Canella}, C. and {Van Bulck}, J. and {Schwarz}, M. and {Lipp}, M. and {von Berg}, B. 
@ARTICLE{canella+18, author = {{Canella}, C. and {Van Bulck}, J. and {Schwarz}, M. and {Lipp}, M. and {von Berg}, B. and {Ortner}, P. and {Piessens}, F. and {Evtyushkin}, D. and {Gruss}, D.}, title = "{A Systematic Evaluation of Transient Execution Attacks and Defenses}", journal = {ArXiv e-prints}, archivePrefix = "arXiv", eprint = {1811.05441}, primaryClass = "cs.CR", keywords = {Computer Science - Cryptography and Security}, year = 2018, month = nov, url = {https://arxiv.org/abs/1811.05441v1}, adsurl = {http://adsabs.harvard.edu/abs/2018arXiv181105441C}, adsnote = {Provided by the SAO/NASA Astrophysics Data System}, annote = {This paper gives a (pretty tight) survey and provides a system for categorizing the various (potential or real) vulnerabilities. Based on this system, the authors also explore a number of additional potential vulnerabilities and find that some are real and most are not. The paper also gives a survey of defenses against these attacks, but that part appears less systematic to me (maybe it just needs some more work, and we will see that work in a future version). What I find less than optimal is that they bunch the results together by manufacturer, instead of listing them per-microarchitecture (and as Meltdown(-US) on ARM shows, different microarchitectures of the same manufacturer can behave differently); in particular, for the ``no CPU is vulnerable'' results (Table 5), the tested CPUs should be mentioned for completeness.} }
@Article{moore74, author = {Charles H. Moore}, title = {Forth: A new way to Program a Mini-Computer}, journal = {Astron. Astrophys. Suppl.}, year = {1974}, volume = {15}, pages = {497--511}, url = {http://adsbit.harvard.edu/cgi-bin/nph-iarticle_query?1974A%26AS...15..497M&defaultprint=YES&filetype=.pdf}, OPTnote = {}, OPTannote = {} }
@Article{smith80, author = {Robert L. Smith}, title = {A Modest Proposal for Dictionary Headers}, journal = {Forth Dimensions}, year = {1980}, volume = {I}, number = {5}, pages = {49}, OPTmonth = jan, OPTnote = {}, annote = {Proposes a header arrangement where the name string is stored in reverse order, growing backwards from the count field, to deal efficiently with variable-length names and \texttt{width} (see the sketch after borning86 below).} }
@Article{ungar91, author = {David Ungar and Craig Chambers and Bay-Wei Chang and Urs H{\"o}lzle}, title = {Organizing Programs Without Classes}, journal = {Lisp and Symbolic Computation}, year = {1991}, volume = {4}, number = {3}, pages = {223--242}, url = {http://bibliography.selflanguage.org/_static/organizing-programs.pdf}, annote = {Discusses various organizational issues of programs in Self. Unfortunately, it assumes knowledge that I do not have, so in the end I did not get much from the paper.} }
@InProceedings{borning86, author = {Alan Borning}, title = {Classes versus Prototypes in Object-Oriented Languages}, booktitle = {ACM/IEEE Fall Joint Computer Conference}, year = {1986}, pages = {36--40}, url = {ftp://ftp.cs.washington.edu/pub/constraints/papers/fjcc-86.pdf}, annote = {Presents the problems of class-based object-oriented programming, in particular the complexity of metaclasses in Smalltalk. Presents prototyping as a solution. Prototyping is defined somewhat formally through constraints, then demonstrated in examples, and the implementation is discussed.} }
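The sketch promised in the smith80 annotation above; this is my reading of the proposal, not code from the article. The name is stored reversed, at addresses below the count byte, so the fixed-size header fields sit at constant offsets regardless of name length, and honoring width (storing only the first n characters) needs no extra bookkeeping.

  #include <string.h>

  /* Store the name so that count[0] is the length and count[-1-i] is
     the i-th character; returns the address of the count byte. */
  unsigned char *store_name(unsigned char *here, const char *name,
                            unsigned char width)
  {
      size_t len = strlen(name);
      unsigned char n = len > width ? width : (unsigned char)len;
      unsigned char *count = here + n;  /* name fills here..here+n-1 */
      *count = n;
      for (unsigned char i = 0; i < n; i++)
          count[-1 - i] = (unsigned char)name[i];
      return count;
  }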
@TechReport{ross06, author = {Kenneth A. Ross}, title = {Efficient Hash Probes on Modern Processors}, institution = {IBM}, year = {2006}, type = {IBM Research Report}, number = {RC24100}, url = "https://domino.research.ibm.com/library/cyberdig.nsf/papers/DF54E3545C82E8A585257222006FD9A2/\$File/rc24100.pdf", annote = {Proposes splash tables, i.e., bucketized cuckoo hash tables. A seeming disadvantage is that cuckoo hashing requires two cache-line accesses, but they are independent, and therefore can be performed in parallel. The paper describes how to avoid branch mispredictions with unrolling and SIMD instructions. The resulting hash tables can be very highly loaded (load factor 0.976 for buckets containing 4 entries and 2 alternative locations per key). Splash tables are compared with chained bucket hashing and quadratic probing on the Pentium 4 and a Cell SPE for various table sizes. The probe time (with 32-bit keys and 32-bit values) is 45 cycles or more (for larger hash tables) on the Pentium 4, and 21 cycles on the Cell SPE; the paper also has results for longer keys/values and an Opteron.} }
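A minimal C sketch (mine, scalar rather than the paper's SIMD version) of a splash-table probe as described in ross06: each key has two candidate buckets of B entries; the two cache lines are independent, so the loads can proceed in parallel, and the fixed-trip-count loop has no early exit, which makes it easy to unroll and if-convert.

  #include <stdint.h>

  #define B 4                            /* entries per bucket */
  typedef struct { uint32_t key[B], val[B]; } bucket;

  /* h1 and h2 are the two hash values of key (hash functions omitted) */
  uint32_t probe(const bucket *tab, uint32_t mask, uint32_t key,
                 uint32_t h1, uint32_t h2, int *found)
  {
      const bucket *b1 = &tab[h1 & mask], *b2 = &tab[h2 & mask];
      uint32_t result = 0;
      *found = 0;
      for (int i = 0; i < B; i++) {      /* fixed trip count, no early exit */
          if (b1->key[i] == key) { result = b1->val[i]; *found = 1; }
          if (b2->key[i] == key) { result = b2->val[i]; *found = 1; }
      }
      return result;
  }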
@InProceedings{askitis09, author = {Nikolas Askitis}, title = {Fast and Compact Hash Tables for Integer Keys}, booktitle = {Australasian Computer Science Conference (ACSC 2009)}, year = {2009}, pages = {101--110}, url = {http://crpit.com/confpapers/CRPITV91Askitis.pdf}, annote = {Gives a pretty comprehensive background discussion on hash table variants (resulting in 1.5 pages of references) and benchmarks the following implementations against each other with 32-bit keys: linear probing, array hash tables (like separate chaining, but with reallocated arrays instead of linked lists), separate chaining, clustered separate chaining (more than one entry per linked-list element), and bucketized cuckoo hashing. Two data sets were used, a skewed one and a distinct one. In both, linear probing looks best. When looking at the paper, note that the graphs' baselines are not at 0, so differences look inflated.} }
@Article{koch15, author = {Matthias Koch}, title = {{Flags, Konstantenfaltung und Optimierungen}}, journal = {Vierte Dimension}, year = {2015}, volume = {31}, number = {arm}, pages = {16--18}, OPTmonth = {}, url = {https://wiki.forth-ev.de/lib/exe/fetch.php/vd-archiv:4d2015-arm.pdf}, OPTannote = {} }
@Manual{mpe16, title = {VFX Forth for x86/x86 64 Linux}, key = {MPE}, optauthor = {}, organization = {Microprocessor Engineering}, optaddress = {}, edition = {4.72}, optmonth = {}, year = {2016}, optnote = {}, optannote = {} }
@Article{paysan19, author = {Bernd Paysan}, title = {{Constant Folding f\"ur Gforth}}, journal = {Vierte Dimension}, year = {2019}, OPTkey = {}, volume = {35}, number = {2}, pages = {17}, OPTmonth = {}, url = {https://wiki.forth-ev.de/lib/exe/fetch.php/vd-archiv:4d2019-02.pdf}, OPTannote = {} }
@Misc{fish11, author = {Fish}, title = {Labor of Division ({Episode III}): Faster Unsigned Division by Constants}, howpublished = {Blog Entry \url{https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html}}, month = oct, year = {2011}, url = {http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf}, annote = {A worked instance of the textbook version of this technique follows after ragsdale81 below.} }
@Misc{granlund19, author = {Torbj\"orn Granlund}, OPTtitle = {Instruction latencies and throughput for AMD and Intel x86 processors}, howpublished = {\url{http://gmplib.org/~tege/x86-timing.pdf}}, year = {2019}, url = {http://gmplib.org/~tege/x86-timing.pdf}, annote = {Contains instruction timing information for AMD64 and IA-32 instructions for pretty much all microarchitectures in a format that makes it easy to see how an instruction performs across a wide variety of microarchitectures. By contrast, Agner Fog's work is more comprehensive, but less handy.} }
@Misc{fog19, author = {Agner Fog}, title = {Instruction tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs}, howpublished = {\url{https://www.agner.org/optimize/instruction_tables.pdf}}, year = {2019}, url = {https://www.agner.org/optimize/instruction_tables.pdf}, tableurl = {https://www.agner.org/optimize/instruction_tables.ods}, annote = {Instruction timing and other microarchitectural information for IA-32 and AMD64 microarchitectures.} }
@Misc{fog22, author = {Agner Fog}, title = {Instruction tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs}, howpublished = {\url{https://www.agner.org/optimize/instruction_tables.pdf}}, year = {2022}, url = {https://www.agner.org/optimize/instruction_tables.pdf}, tableurl = {https://www.agner.org/optimize/instruction_tables.ods}, annote = {Instruction timing and other microarchitectural information for IA-32 and AMD64 microarchitectures.} }
@Article{ragsdale81, author = {William F. Ragsdale}, title = {A new syntax for defining defining words}, journal = {Forth Dimensions}, year = {1981}, volume = {II}, number = {5}, pages = {121--128}, month = {January/February}, annote = {Proposes a syntax for defining defining words where all the non-string header information about a word is first collected on the stack, before the word is finally created with \texttt{builds>}.} }
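The worked instance promised in the fish11 annotation above: the textbook round-up method that the blog entry refines, shown here (my example, not from the blog) for division of a 32-bit operand by 3: multiply by a precomputed fixed-point reciprocal and shift.

  #include <stdint.h>

  /* n/3 == (n * m) >> 33 with m = ceil(2^33 / 3) = 0xAAAAAAAB,
     for all uint32_t n */
  uint32_t div3(uint32_t n)
  {
      return (uint32_t)(((uint64_t)n * 0xAAAAAAABu) >> 33);
  }

Why it works: m = (2^33 + 1)/3, so n*m/2^33 = n/3 + n/(3*2^33); for n < 2^32 the error term stays below 1/6, too small to push the value past the next integer, so the floor is exactly n/3.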
@InProceedings{jangda+19, author = {Abhinav Jangda and Bobby Powers and Emery D. Berger and Arjun Guha}, title = {Not So Fast: Analyzing the Performance of WebAssembly vs. Native Code}, booktitle = {2019 USENIX Annual Technical Conference}, year = {2019}, pages = {107--120}, month = jul, url1 = {https://www.usenix.org/system/files/atc19-jangda.pdf}, url2 = {https://arxiv.org/abs/1901.09056}, annote = {WebAssembly has had only small benchmarks because it did not support enough library/system calls for more substantial benchmarks. In this paper Browsix (which provides such calls) is ported from JavaScript to WebAssembly, a harness for running SPEC benchmarks compiled to WebAssembly is developed, and SPEC 2006 and 2017 benchmarks are run on WebAssembly and compared with native code (compiled with clang). On the SPEC benchmarks WebAssembly shows a bigger slowdown compared to native code (geomean factors 1.45 and 1.55, worst-case factors 2.08 and 2.5) than on the previous benchmarks. The reasons for this are worse register allocation (from worse code generation and reserved registers) and other code generation shortcomings (the need for fast compilation may be a contributing factor), and various checks required for the safety goals of WebAssembly. Unfortunately, the paper does not identify how much of the slowdown could be avoided by better code generation.} }
@Article{fenwick01, author = {Peter Fenwick}, title = {Fast String Matching for Multiple Searches}, journal = spe, year = {2001}, volume = {31}, pages = {815--833}, url = {https://www.cs.auckland.ac.nz/~peter-f/FTPfiles/2001%20String%20Searches.pdf}, annote = {Compares a number of string search algorithms on actual data, giving both operation counts and timing results. Knuth-Morris-Pratt is slower than the naive algorithm by a factor of 3 on the test case he used (searching for a word from a paragraph in a book). (The naive algorithm is sketched after dhurjati03 below.)} }
@Book{charras&lecroq04, author = {Christian Charras and Thierry Lecroq}, title = {Handbook of Exact String-Matching Algorithms}, publisher = {College Publications}, year = {2004}, ISBN = {978-0-9543006-4-7}, url1 = {http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf}, url2 = {http://www-igm.univ-mlv.fr/~lecroq/string/index.html}, annote = {Explains 34 string searching algorithms, each with C code and an example of its operation.} }
@InProceedings{dhurjati03, author = {Dinakar Dhurjati and Sumant Kowshik and Vikram Adve and Chris Lattner}, title = {Memory Safety Without Runtime Checks or Garbage Collection}, booktitle = {Proceedings of the 2003 ACM SIGPLAN Conference on Language, Compiler, and Tool for Embedded Systems (LCTES)}, year = {2003}, pages = {69--80}, url = {https://llvm.org/pubs/2003-05-05-LCTES03-CodeSafety.pdf}, OPTannote = {} }
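The sketch promised in the fenwick01 annotation above: the naive algorithm (my formulation of it) that beat Knuth-Morris-Pratt by a factor of 3 on Fenwick's natural-language test case. On such data the comparison at each position almost always fails on the first character, so the quadratic worst case hardly ever materializes.

  #include <string.h>

  const char *naive_search(const char *text, const char *pat)
  {
      size_t n = strlen(text), m = strlen(pat);
      if (m == 0) return text;
      if (m > n)  return NULL;
      for (size_t i = 0; i + m <= n; i++)   /* try every position */
          if (text[i] == pat[0] &&
              memcmp(text + i + 1, pat + 1, m - 1) == 0)
              return text + i;
      return NULL;
  }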
@Article{mccourt&marisa81, author = {Michael McCourt and Richard A. Marisa}, title = {The String Stack}, journal = {Forth Dimensions}, year = {1981}, volume = {III}, number = {4}, pages = {121--124}, OPTmonth = nov, url = {https://www.complang.tuwien.ac.at/forth/forth-dimensions/FD-V3.pdf}, OPTannote = {} }
@TechReport{ozturk+12, author = {Erdinc Ozturk and James Guilford and Vinodh Gopal and Wajdi Feghali}, title = {New Instructions Supporting Large Integer Arithmetic on {Intel} Architecture Processors}, institution = {Intel}, year = {2012}, type = {White Paper}, number = {327831-001}, url = {https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf}, OPTannote = {} }
@Misc{360assembly, title = {Wikibook: 360 Assembly}, howpublished = {\url{https://en.wikibooks.org/w/index.php?title=360_Assembly/360_Instructions&stableid=4078098}}, year = {2022}, OPTnote = {Wikibook}, OPTannote = {} }
@Manual{mips64r6vI-A, title = {MIPS Architecture For Programmers Volume I-A: Introduction to the MIPS64 Architecture}, OPTkey = {}, OPTauthor = {}, organization = {MIPS}, edition = {Revision 6.01}, year = {2014}, OPTnote = {Document Number: MD00083}, OPTannote = {} }
@Manual{powerpcuisa, title = {PowerPC User Instruction Set Architecture -- Book I}, OPTauthor = {}, organization = {IBM}, OPTaddress = {}, edition = {Version 2.02}, year = {2005}, OPTnote = {}, OPTannote = {} }
@MastersThesis{kleffner17, author = {Robert Kleffner}, title = {A Foundation for Typed Concatenative Languages}, school = {Northeastern University}, year = {2017}, url = {https://www2.ccs.neu.edu/racket/pubs/dissertation-kleffner.pdf}, OPTannote = {} }
@MastersThesis{riegler15, author = {Gregor Riegler}, title = {Evaluation and Implementation of an Optional, Pluggable Type System for {Forth}}, school = {Technische Universit\"at Wien}, year = {2015}, url1 = {https://www.complang.tuwien.ac.at/Diplomarbeiten/riegler15.pdf}, url2 = {https://repositum.tuwien.at/handle/20.500.12708/7117}, OPTannote = {} }
@Article{titzer22, author = {Ben L. Titzer}, title = {A Fast In-Place Interpreter for {WebAssembly}}, journal = pacmpl, year = {2022}, volume = {6}, number = {OOPSLA2}, pages = {148:1--148:27}, OPTannote = {} }
@Book{waterman&asanovic17, ALTauthor = {}, editor = {Andrew Waterman and Krste Asanovi\'{c}}, title = {The RISC-V Instruction Set Manual, Volume I: User-Level ISA}, publisher = {RISC-V Foundation}, year = {2017}, edition = {Document Version~2.2}, month = may, url = {https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf}, OPTannote = {} }
@TechReport{celio+16, author = {Christopher Celio and Daniel Dabbelt and David A.
Patterson and Krste Asanovi\'{c}}, title = {The Renewed Case for the Reduced Instruction Set Computer: Avoiding {ISA} Bloat with Macro-Op Fusion for {RISC-V}}, institution = {Berkeley}, year = {2016}, OPTkey = {}, OPTtype = {}, number = {UCB/EECS-2016-130}, OPTaddress = {}, OPTmonth = {}, url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-130.html}, url1 = {https://arxiv.org/pdf/1607.02318.pdf}, OPTannote = {} }
@Unpublished{bernstein05, author = {Bernstein, Daniel J.}, title = {Cache-timing attacks on {AES}}, note = {}, year = {2005}, url = {https://cr.yp.to/antiforgery/cachetiming-20050414.pdf}, OPTannote = {} }
@Conference{yan+18, author = {Mengjia Yan and Jiho Choi and Dimitrios Skarlatos and Adam Morrison and Christopher Fletcher and Josep Torrellas}, title = {{InvisiSpec}: Making Speculative Execution Invisible in the Cache Hierarchy}, booktitle = {International Symposium on Microarchitecture}, year = {2018}, pages = {428--441}, url = {https://iacoma.cs.uiuc.edu/iacoma-papers/micro18.pdf}, OPTannote = {} }
@Conference{khasawneh+19, author = {Khaled N. Khasawneh and Esmaeil Mohammadian Koruyeh and Chengyu Song and Dmitry Evtyushkin and Dmitry Ponomarev and Nael Abu-Ghazaleh}, title = {{SafeSpec}: Banishing the {Spectre} of a {Meltdown} with Leakage-Free Speculation}, booktitle = {Design Automation Conference}, year = {2019}, OPTpages = {1--6}, url = {https://www.cs.ucr.edu/~csong/dac19-safespec.pdf}, OPTannote = {} }
@Conference{ainsworth&jones20, author = {Sam Ainsworth and Timothy M. Jones}, title = {{MuonTrap}: Preventing Cross-Domain {Spectre}-Like Attacks by Capturing Speculative State}, booktitle = {International Symposium on Computer Architecture (ISCA)}, year = {2020}, pages = {132--144}, url = {https://www.cl.cam.ac.uk/~tmj32/papers/docs/ainsworth20-isca.pdf}, OPTannote = {} }
@Conference{saileshwar&qureshi19, author = {Gururaj Saileshwar and Moinuddin K. Qureshi}, title = {{CleanupSpec}: An undo approach to safe speculation}, booktitle = {International Symposium on Microarchitecture}, year = {2019}, pages = {73--86}, url = {https://memlab.ece.gatech.edu/papers/MICRO_2019_3.pdf}, OPTannote = {} }
@InProceedings{behnia+21, author = {Mohammad Behnia and Prateek Sahu and Riccardo Paccagnella and Jiyong Yu and Zirui Neil Zhao and Xiang Zou and Thomas Unterluggauer and Josep Torrellas and Carlos Rozas and Adam Morrison and Frank McKeen and Fangfei Liu and Ron Gabor and Christopher W.
Fletcher and Abhishek Basak and Alaa Alameldeen}, title = {Speculative Interference Attacks: Breaking Invisible Speculation Schemes}, booktitle = {Architectural Support for Programming Languages and Operating Systems (ASPLOS '21)}, year = {2021}, pages = {1046--1060}, url = {https://dl.acm.org/doi/10.1145/3445814.3446708}, OPTannote = {} }
@Conference{bhattacharyya+19, author = {Atri Bhattacharyya and Alexandra Sandulescu and Matthias Neugschwandtner and Alessandro Sorniotti and Babak Falsafi and Mathias Payer and Anil Kurmus}, title = {{SMoTherSpectre}: Exploiting speculative execution through port contention}, booktitle = {Conference on Computer and Communications Security}, year = {2019}, OPTpages = {785--800}, OPTmonth = {}, OPTaddress = {}, OPTorganization = {}, OPTpublisher = {}, url = {https://arxiv.org/abs/1903.01843}, OPTannote = {} }
@misc{randal23lose, title = {This is How You Lose the Transient Execution War}, author = {Allison Randal}, year = {2023}, eprint = {2309.03376}, archivePrefix = {arXiv}, primaryClass = {cs.CR}, annote = {This is a survey paper about transient execution attacks (Spectre and friends) and various mitigations and related topics, with 269 references (in v1). This work starts with a discussion of side channels in general before continuing with Spectre and friends. For Spectre and friends the discussion covers the wide range of variants, as well as several categories of countermeasures.} }
@Misc{tracy84, author = {Martin Tracy}, title = {SVARS.TXT}, howpublished = {Handout to the L.A. FIG group in 1984}, year = {1984}, abstract = {The following screens cover specialized variables and related words such as QUAN, TO, AT, IS, and VALUE. These screens were originally written by Martin Tracy (MicroMotion) as a handout to the L.A. FIG group in 1984.}, url = {https://pastebin.com/p5P5EVTm}, OPTannote = {} }
@Article{wimmer+13, author = {Christian Wimmer and Michael Haupt and Michael L. Van De Vanter and Mick Jordan and Laurent Dayn\`{e}s and Douglas Simon}, title = {{Maxine}: An Approachable Virtual Machine For, and In, {Java}}, journal = {ACM Transactions on Architecture and Code Optimization}, year = {2013}, volume = {9}, number = {4}, pages = {30:1--30:24}, month = jan, annote = {Provides an overview of a Java VM implementation that is intended to be easy to understand, in particular by being written in Java. It uses a template-based baseline compiler where the source code of the templates is Java (and the native code comes from the optimizing compiler or from HotSpot).} }
@Article{iliasov03, author = {Alex Iliasov}, title = {Templates-based portable Just-In-Time compiler}, journal = {SIGPLAN Notices}, year = {2003}, volume = {38}, number = {8}, pages = {37--43}, month = aug, annote = {Describes a JIT compilation technique that copies native code generated by a C compiler. To make the code copyable, it has to be relocatable, and the paper discusses how to achieve this. For dealing with literals, the literal is apparently patched into the native code (using machine-specific code). Control flow is apparently performed by loading a literal and performing an indirect jump to it, reducing the machine-specific code to just dealing with literals (while in related work Ertl and Gregg \cite{ertl&gregg04pact} patch both literals and control-flow instructions).
The author presents results for the Lemick bytecode interpreter and the JIT compiler derived from that, and compares it to Kaffe, C, and Perl.} }
@Book{huff54, author = {Darrell Huff}, title = {How to Lie with Statistics}, publisher = {W. W. Norton \& Company}, year = {1954}, OPTannote = {} }
@Book{beck-bornholdt&dubben97, author = {Hans-Peter Beck-Bornholdt and Hans-Hermann Dubben}, title = {Der Hund, der Eier legt}, publisher = {Rowohlt}, year = {1997}, OPTannote = {} }
@Book{tufte01, author = {Edward R. Tufte}, title = {The Visual Display of Quantitative Information}, publisher = {Graphics Pr}, year = {2001}, edition = {2nd}, OPTannote = {} }
@Article{xu&kjolstad21, author = {Haoran Xu and Fredrik Kjolstad}, title = {Copy-and-Patch Compilation}, journal = pacmpl, year = {2021}, volume = {5}, number = {OOPSLA}, pages = {136:1--136:30}, month = oct, url = {https://fredrikbk.com/publications/copy-and-patch.pdf}, annote = {Presents a compilation approach that produces code snippets (called ``binary stencils'') with a C++ compiler, with each code snippet for a virtual-machine instruction (``bytecode'') or AST node, or a superinstruction/node; each code snippet is created as a C++ function that ends in a tail-call and is compiled with LLVM with the GHC calling convention to object files. From the object files the binary code and relocation data are extracted. Code is compiled by concatenating the code pieces and filling in (patching) the holes described by the relocation data; the branch at the end is omitted if the control flow is from the present snippet to the next one compiled. The advantage of this method is that it requires relatively little architecture-specific code; the authors actually claim that with this technique ``the system does not need any knowledge of platform-specific machine instruction encoding and is thus portable'', but they are mistaken: the relocation is architecture-specific, and omitting the branch at the end of each code snippet also cannot be done without platform-specific knowledge. The authors also do not discuss how to check whether the resulting code can be copied, and what to do if it cannot. These snippets are generated with a generator implemented using the C++ template language (including support for superinstructions/nodes). The authors describe an AST-based compiler that uses these snippets to achieve fast startup times with reasonably fast code. They evaluate their approach by implementing a WebAssembly compiler and comparing it with several other WebAssembly systems, and by building an AST-based compiler for a language that is used for several queries of the TPC-H benchmark; their approach outperforms the competition in compile time or execution time, sometimes both.} }
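A much-simplified sketch (mine, in plain C rather than the paper's LLVM/GHC-calling-convention setup) of the stencil shape behind xu&kjolstad21: the literal operand and the continuation are ``holes'', i.e., extern symbols whose relocations are later located in the object file and patched with actual values when snippets are concatenated. The symbol names here are made up for illustration.

  #include <stdint.h>

  extern char HOLE_LITERAL;             /* address = the literal value,
                                           filled in via relocation */
  extern void HOLE_NEXT(int64_t *sp);   /* call target = next snippet */

  void stencil_push_literal(int64_t *sp)
  {
      *--sp = (int64_t)&HOLE_LITERAL;   /* push the patched literal */
      HOLE_NEXT(sp);                    /* becomes a tail jump at -O2;
                                           dropped when the next snippet
                                           is placed directly after */
  }

Plain C does not guarantee the tail call; the paper gets guaranteed tail calls and register-only argument passing from LLVM's GHC calling convention, which is part of what makes the snippets copyable at all.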