diff --git a/libs/libarchfpga/src/arch_check.cpp b/libs/libarchfpga/src/arch_check.cpp index c8fb00299c4..5360d6e4c02 100644 --- a/libs/libarchfpga/src/arch_check.cpp +++ b/libs/libarchfpga/src/arch_check.cpp @@ -32,7 +32,7 @@ bool check_model_clocks(t_model* model, const char* file, uint32_t line) { bool check_model_combinational_sinks(const t_model* model, const char* file, uint32_t line) { //Outputs should have no combinational sinks for (t_model_ports* port = model->outputs; port != nullptr; port = port->next) { - if (port->combinational_sink_ports.size() != 0) { + if (!port->combinational_sink_ports.empty()) { archfpga_throw(file, line, "Model '%s' output port '%s' can not have combinational sink ports", model->name, port->name); @@ -114,9 +114,9 @@ void check_port_direct_mappings(t_physical_tile_type_ptr physical_tile, t_sub_ti } for (auto pin_map : pin_direct_map) { - auto block_port = get_port_by_pin(logical_block, pin_map.first.pin); + const t_port* block_port = logical_block->get_port_by_pin(pin_map.first.pin); - auto sub_tile_port = get_port_by_pin(sub_tile, pin_map.second.pin); + const t_physical_tile_port* sub_tile_port = sub_tile->get_port_by_pin(pin_map.second.pin); VTR_ASSERT(block_port != nullptr); VTR_ASSERT(sub_tile_port != nullptr); diff --git a/libs/libarchfpga/src/arch_util.h b/libs/libarchfpga/src/arch_util.h index c39cf77b94f..fb251bffe10 100644 --- a/libs/libarchfpga/src/arch_util.h +++ b/libs/libarchfpga/src/arch_util.h @@ -23,8 +23,8 @@ class InstPort { InstPort() = default; InstPort(const std::string& str); - std::string instance_name() const { return instance_.name; } - std::string port_name() const { return port_.name; } + const std::string& instance_name() const { return instance_.name; } + const std::string& port_name() const { return port_.name; } int instance_low_index() const { return instance_.low_idx; } int instance_high_index() const { return instance_.high_idx; } @@ -40,7 +40,7 @@ class InstPort { private: struct name_index { 
- std::string name = ""; + std::string name; int low_idx = UNSPECIFIED; int high_idx = UNSPECIFIED; }; diff --git a/libs/libarchfpga/src/physical_types.cpp b/libs/libarchfpga/src/physical_types.cpp index 3bdabaee2a7..bdacf50931d 100644 --- a/libs/libarchfpga/src/physical_types.cpp +++ b/libs/libarchfpga/src/physical_types.cpp @@ -136,6 +136,56 @@ bool t_physical_tile_type::is_empty() const { return name == std::string(EMPTY_BLOCK_NAME); } +int t_physical_tile_type::find_pin(std::string_view port_name, int pin_index_in_port) const { + int ipin = OPEN; + int port_base_ipin = 0; + int num_port_pins = OPEN; + int pin_offset = 0; + + bool port_found = false; + for (const t_sub_tile& sub_tile : sub_tiles) { + for (const t_physical_tile_port& port : sub_tile.ports) { + if (port_name == port.name) { + port_found = true; + num_port_pins = port.num_pins; + break; + } + + port_base_ipin += port.num_pins; + } + + if (port_found) { + break; + } + + port_base_ipin = 0; + pin_offset += sub_tile.num_phy_pins; + } + + if (num_port_pins != OPEN) { + VTR_ASSERT(pin_index_in_port < num_port_pins); + + ipin = port_base_ipin + pin_index_in_port + pin_offset; + } + + return ipin; +} + +int t_physical_tile_type::find_pin_class(std::string_view port_name, int pin_index_in_port, e_pin_type pin_type) const { + int iclass = OPEN; + + int ipin = find_pin(port_name, pin_index_in_port); + + if (ipin != OPEN) { + iclass = pin_class[ipin]; + + if (iclass != OPEN) { + VTR_ASSERT(class_inf[iclass].type == pin_type); + } + } + return iclass; +} + /* * t_logical_block_type */ @@ -144,6 +194,28 @@ bool t_logical_block_type::is_empty() const { return name == std::string(EMPTY_BLOCK_NAME); } +const t_port* t_logical_block_type::get_port(std::string_view port_name) const { + for (int i = 0; i < pb_type->num_ports; i++) { + auto port = pb_type->ports[i]; + if (port_name == port.name) { + return &pb_type->ports[port.index]; + } + } + + return nullptr; +} + +const t_port* 
t_logical_block_type::get_port_by_pin(int pin) const { + for (int i = 0; i < pb_type->num_ports; i++) { + const t_port& port = pb_type->ports[i]; + if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) { + return &pb_type->ports[port.index]; + } + } + + return nullptr; +} + /** * t_pb_graph_node */ @@ -220,7 +292,7 @@ std::string t_pb_graph_pin::to_string(const bool full_description) const { return pin_string; } -/** +/* * t_pb_graph_edge */ @@ -253,3 +325,39 @@ bool t_pb_graph_edge::belongs_to_pattern(int pattern_index) const { // return false otherwise return false; } + +/* + * t_sub_tile + */ + +int t_sub_tile::total_num_internal_pins() const { + int num_pins = 0; + + for (t_logical_block_type_ptr eq_site : equivalent_sites) { + num_pins += (int)eq_site->pin_logical_num_to_pb_pin_mapping.size(); + } + + num_pins *= capacity.total(); + + return num_pins; +} + +const t_physical_tile_port* t_sub_tile::get_port(std::string_view port_name) { + for (const t_physical_tile_port& port : ports) { + if (port_name == port.name) { + return &ports[port.index]; + } + } + + return nullptr; +} + +const t_physical_tile_port* t_sub_tile::get_port_by_pin(int pin) const { + for (const t_physical_tile_port& port : ports) { + if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) { + return &ports[port.index]; + } + } + + return nullptr; +} \ No newline at end of file diff --git a/libs/libarchfpga/src/physical_types.h b/libs/libarchfpga/src/physical_types.h index 4d415697554..c11f1c451ee 100644 --- a/libs/libarchfpga/src/physical_types.h +++ b/libs/libarchfpga/src/physical_types.h @@ -24,8 +24,7 @@ * Authors: Jason Luu and Kenneth Kent */ -#ifndef PHYSICAL_TYPES_H -#define PHYSICAL_TYPES_H +#pragma once #include #include @@ -704,11 +703,7 @@ struct t_physical_tile_type { * tile_block_pin_directs_map[logical block index][logical block pin] -> physical tile pin */ std::unordered_map>> 
tile_block_pin_directs_map; - /* Returns the indices of pins that contain a clock for this physical logic block */ - std::vector get_clock_pins_indices() const; - // Returns the sub tile location of the physical tile given an input pin - int get_sub_tile_loc_from_pin(int pin_num) const; // TODO: Remove is_input_type / is_output_type as part of // https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1193 @@ -719,8 +714,21 @@ struct t_physical_tile_type { // Does this t_physical_tile_type contain an outpad? bool is_output_type = false; - // Is this t_physical_tile_type an empty type? + public: // Function members + ///@brief Returns the indices of pins that contain a clock for this physical logic block + std::vector get_clock_pins_indices() const; + + ///@brief Returns the sub tile location of the physical tile given an input pin + int get_sub_tile_loc_from_pin(int pin_num) const; + + ///@brief Is this t_physical_tile_type an empty type? bool is_empty() const; + + ///@brief Returns the relative pin index within a sub tile that corresponds to the pin within the given port and its index in the port + int find_pin(std::string_view port_name, int pin_index_in_port) const; + + ///@brief Returns the pin class associated with the specified pin_index_in_port within the port port_name of this physical tile type + int find_pin_class(std::string_view port_name, int pin_index_in_port, e_pin_type pin_type) const; }; /* Holds the capacity range of a certain sub_tile block within the parent physical tile type. 
@@ -796,6 +804,19 @@ struct t_sub_tile { int num_phy_pins = 0; int index = -1; + + public: + int total_num_internal_pins() const; + + /** + * @brief Returns the port of this sub tile with the given name, or nullptr if there is none + */ + const t_physical_tile_port* get_port(std::string_view port_name); + + /** + * @brief Returns the port of this sub tile whose pin range contains the given pin index, or nullptr if there is none + */ + const t_physical_tile_port* get_port_by_pin(int pin) const; }; /** A logical pin defines the pin index of a logical block type (i.e. a top level PB type) @@ -950,6 +971,17 @@ struct t_logical_block_type { // Is this t_logical_block_type empty? bool is_empty() const; + + public: + /** + * @brief Returns the port of this logical block type with the given name, or nullptr if there is none + */ + const t_port* get_port(std::string_view port_name) const; + + /** + * @brief Returns the port of this logical block type whose pin range contains the given pin index, or nullptr if there is none + */ + const t_port* get_port_by_pin(int pin) const; }; /************************************************************************************************* @@ -2124,5 +2156,3 @@ struct t_arch { /// Stores NoC-related architectural information when there is an embedded NoC t_noc_inf* noc = nullptr; }; - -#endif diff --git a/libs/libarchfpga/src/physical_types_util.cpp b/libs/libarchfpga/src/physical_types_util.cpp index 2256f81d66c..2ecc7fbd41c 100644 --- a/libs/libarchfpga/src/physical_types_util.cpp +++ b/libs/libarchfpga/src/physical_types_util.cpp @@ -154,7 +154,7 @@ static std::tuple get_pin_index_for_inst(t_physical_til pin_inst_num = (pin_physical_num - pin_offset) % pins_per_inst; } else { int pin_offset = get_sub_tile_inst_physical_pin_num_offset(type, sub_tile, sub_tile_cap); - int pins_per_inst = get_total_num_sub_tile_internal_pins(sub_tile) / sub_tile->capacity.total(); + int pins_per_inst = sub_tile->total_num_internal_pins() / sub_tile->capacity.total(); pin_inst_num = (pin_physical_num - 
pin_offset) % pins_per_inst; } @@ -225,7 +225,7 @@ static int get_sub_tile_physical_pin_num_offset(t_physical_tile_type_ptr physica if (&tmp_sub_tile == curr_sub_tile) break; else - offset += get_total_num_sub_tile_internal_pins(&tmp_sub_tile); + offset += tmp_sub_tile.total_num_internal_pins(); } return offset; @@ -235,7 +235,7 @@ static int get_sub_tile_inst_physical_pin_num_offset(t_physical_tile_type_ptr ph const t_sub_tile* curr_sub_tile, const int curr_relative_cap) { int offset = get_sub_tile_physical_pin_num_offset(physical_tile, curr_sub_tile); - int sub_tile_inst_num_pins = get_total_num_sub_tile_internal_pins(curr_sub_tile) / curr_sub_tile->capacity.total(); + int sub_tile_inst_num_pins = curr_sub_tile->total_num_internal_pins() / curr_sub_tile->capacity.total(); offset += (curr_relative_cap * sub_tile_inst_num_pins); @@ -563,57 +563,6 @@ int get_max_num_pins(t_logical_block_type_ptr logical_block) { return max_num_pins; } -//Returns the pin class associated with the specified pin_index_in_port within the port port_name on type -int find_pin_class(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port, e_pin_type pin_type) { - int iclass = OPEN; - - int ipin = find_pin(type, port_name, pin_index_in_port); - - if (ipin != OPEN) { - iclass = type->pin_class[ipin]; - - if (iclass != OPEN) { - VTR_ASSERT(type->class_inf[iclass].type == pin_type); - } - } - return iclass; -} - -int find_pin(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port) { - int ipin = OPEN; - int port_base_ipin = 0; - int num_pins = OPEN; - int pin_offset = 0; - - bool port_found = false; - for (const auto& sub_tile : type->sub_tiles) { - for (const auto& port : sub_tile.ports) { - if (0 == strcmp(port.name, port_name.c_str())) { - port_found = true; - num_pins = port.num_pins; - break; - } - - port_base_ipin += port.num_pins; - } - - if (port_found) { - break; - } - - port_base_ipin = 0; - pin_offset += sub_tile.num_phy_pins; - } - - if 
(num_pins != OPEN) { - VTR_ASSERT(pin_index_in_port < num_pins); - - ipin = port_base_ipin + pin_index_in_port + pin_offset; - } - - return ipin; -} - std::pair get_capacity_location_from_physical_pin(t_physical_tile_type_ptr physical_tile, int pin) { int pins_to_remove = 0; for (const auto& sub_tile : physical_tile->sub_tiles) { @@ -638,7 +587,7 @@ std::pair get_capacity_location_from_physical_pin(t_physical_tile_type int get_physical_pin_from_capacity_location(t_physical_tile_type_ptr physical_tile, int relative_pin, int capacity_location) { int pins_to_add = 0; - for (auto sub_tile : physical_tile->sub_tiles) { + for (const t_sub_tile& sub_tile : physical_tile->sub_tiles) { auto capacity = sub_tile.capacity; int rel_capacity = capacity_location - capacity.low; int num_inst_pins = sub_tile.num_phy_pins / capacity.total(); @@ -841,52 +790,6 @@ std::vector block_type_class_index_to_pin_names(t_physical_tile_typ return pin_names; } -const t_physical_tile_port* get_port_by_name(t_sub_tile* sub_tile, const char* port_name) { - for (auto port : sub_tile->ports) { - if (0 == strcmp(port.name, port_name)) { - return &sub_tile->ports[port.index]; - } - } - - return nullptr; -} - -const t_port* get_port_by_name(t_logical_block_type_ptr type, const char* port_name) { - auto pb_type = type->pb_type; - - for (int i = 0; i < pb_type->num_ports; i++) { - auto port = pb_type->ports[i]; - if (0 == strcmp(port.name, port_name)) { - return &pb_type->ports[port.index]; - } - } - - return nullptr; -} - -const t_physical_tile_port* get_port_by_pin(const t_sub_tile* sub_tile, int pin) { - for (auto port : sub_tile->ports) { - if (pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) { - return &sub_tile->ports[port.index]; - } - } - - return nullptr; -} - -const t_port* get_port_by_pin(t_logical_block_type_ptr type, int pin) { - auto pb_type = type->pb_type; - - for (int i = 0; i < pb_type->num_ports; i++) { - auto port = pb_type->ports[i]; - if 
(pin >= port.absolute_first_pin_index && pin < port.absolute_first_pin_index + port.num_pins) { - return &pb_type->ports[port.index]; - } - } - - return nullptr; -} - /* Access information related to pin classes */ /** get information given class physical num **/ @@ -1009,7 +912,7 @@ std::tuple get_sub_tile_from_pin_physical_num(t_physical int pin_offset = total_pin_counts; for (auto& sub_tile : physical_tile->sub_tiles) { - int sub_tile_num_pins = pin_on_tile ? sub_tile.num_phy_pins : get_total_num_sub_tile_internal_pins(&sub_tile); + int sub_tile_num_pins = pin_on_tile ? sub_tile.num_phy_pins : sub_tile.total_num_internal_pins(); total_pin_counts += sub_tile_num_pins; if (physical_num < total_pin_counts) { @@ -1347,15 +1250,6 @@ const t_pb_graph_node* get_pb_graph_node_from_pin_physical_num(t_physical_tile_t return pb_graph_pin->parent_node; } -int get_total_num_sub_tile_internal_pins(const t_sub_tile* sub_tile) { - int num_pins = 0; - for (auto eq_site : sub_tile->equivalent_sites) { - num_pins += (int)eq_site->pin_logical_num_to_pb_pin_mapping.size(); - } - num_pins *= sub_tile->capacity.total(); - return num_pins; -} - int get_tile_pin_max_ptc(t_physical_tile_type_ptr tile, bool is_flat) { if (is_flat) { return tile->num_pins + (int)tile->pin_num_to_pb_pin.size(); @@ -1538,4 +1432,3 @@ std::map get_sink_choking_points(t_physical_tile_type_ptr physical_til return choking_point; } -/* */ diff --git a/libs/libarchfpga/src/physical_types_util.h b/libs/libarchfpga/src/physical_types_util.h index aa7b2617834..a081683faeb 100644 --- a/libs/libarchfpga/src/physical_types_util.h +++ b/libs/libarchfpga/src/physical_types_util.h @@ -1,5 +1,5 @@ -#ifndef PHYSICAL_TYPES_UTIL_H -#define PHYSICAL_TYPES_UTIL_H + +#pragma once #include "physical_types.h" @@ -13,11 +13,11 @@ * functions in this file are the following: * * - physical_tile_type: identifies a placeable tile within * * the device grid. 
* - * - logical_block_tpye: identifies a clustered block type * + * - logical_block_type: identifies a clustered block type * * within the clb_netlist * * * * All the following utilities are intended to ease the * - * developement to access the above mentioned classes and perform * + * development to access the above mentioned classes and perform * * some required operations with their data. * * * * Please classify such functions in this file * @@ -107,7 +107,7 @@ * * For instance, the following information are required: * - mapping between logical and sub tile pins. - * - mapping between sub tile pins and absoulte physical pin + * - mapping between sub tile pins and absolute physical pin * - capacity instance of the sub tile * * With all the above information we can calculate correctly the connection between the CLK (logical pin) @@ -152,12 +152,12 @@ int get_physical_pin_from_capacity_location(t_physical_tile_type_ptr physical_ti * * Take the above CLOCK TILE example: * - given the CLOCK TILE and the index corresponding to the CLK_1 pin, we want the relative pin - * of one of its sub tiles at a particualr capacity location (i.e. sub tile instance). + * of one of its sub tiles at a particular capacity location (i.e. sub tile instance). 
* * std::tie(absolute_capacity, relative_pin) = get_capacity_location_from_physical_pin(clock_tile, 3) * * The value returned is (1, 0), where: - * - 1 corresponds to the capacity location (sub tile instance) where the absoulte physical pin index (CLK_1) is connected + * - 1 corresponds to the capacity location (sub tile instance) where the absolute physical pin index (CLK_1) is connected * - 0 corresponds to the relative pin index within the BUFGCTRL sub tile */ std::pair get_capacity_location_from_physical_pin(t_physical_tile_type_ptr physical_tile, int pin); @@ -173,11 +173,6 @@ std::vector block_type_class_index_to_pin_names(t_physical_tile_typ ///@brief Returns the physical tile type matching a given physical tile type name, or nullptr (if not found) t_physical_tile_type_ptr find_tile_type_by_name(const std::string& name, const std::vector& types); -int find_pin_class(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port, e_pin_type pin_type); - -///@brief Returns the relative pin index within a sub tile that corresponds to the pin within the given port and its index in the port -int find_pin(t_physical_tile_type_ptr type, std::string port_name, int pin_index_in_port); - ///@brief Returns the maximum number of pins within a logical block int get_max_num_pins(t_logical_block_type_ptr logical_block); @@ -217,7 +212,7 @@ int get_logical_block_physical_sub_tile_index(t_physical_tile_type_ptr physical_ t_logical_block_type_ptr logical_block); /** * @brief Returns the physical pin index (within 'physical_tile') corresponding to the - * logical index ('pin' of the first instance of 'logical_block' within the physcial tile. + * logical index ('pin' of the first instance of 'logical_block' within the physical tile. * * This function is called before/during placement, when a sub tile index was not yet assigned. 
* @@ -228,7 +223,7 @@ int get_physical_pin(t_physical_tile_type_ptr physical_tile, int pin); /** * @brief Returns the physical pin index (within 'physical_tile') corresponding to the - * logical index ('pin' of the first instance of 'logical_block' within the physcial tile. + * logical index ('pin' of the first instance of 'logical_block' within the physical tile. * This function considers if a given offset is in the range of sub tile capacity * * (First pin index at current sub-tile) (The wanted pin index) @@ -286,26 +281,6 @@ int get_sub_tile_physical_pin(int sub_tile_index, */ t_physical_tile_port find_tile_port_by_name(t_physical_tile_type_ptr type, std::string_view port_name); -/** - * @brief Returns the physical tile port given the port name and the corresponding sub tile - */ -const t_physical_tile_port* get_port_by_name(t_sub_tile* sub_tile, const char* port_name); - -/** - * @brief Returns the logical block port given the port name and the corresponding logical block type - */ -const t_port* get_port_by_name(t_logical_block_type_ptr type, const char* port_name); - -/** - * @brief Returns the physical tile port given the pin name and the corresponding sub tile - */ -const t_physical_tile_port* get_port_by_pin(const t_sub_tile* sub_tile, int pin); - -/** - * @brief Returns the logical block port given the pin name and the corresponding logical block type - */ -const t_port* get_port_by_pin(t_logical_block_type_ptr type, int pin); - /************************************ Access to intra-block resources ************************************/ /* Access information related to pin classes */ @@ -336,12 +311,6 @@ inline bool is_class_on_tile(t_physical_tile_type_ptr physical_tile, int class_p /** * @brief Classes are indexed in a way that the number of classes on the same pb_graph_node is continuous - * @param physical_tile - * @param sub_tile - * @param logical_block - * @param sub_tile_relative_cap - * @param pb_graph_node - * @return */ t_class_range 
get_pb_graph_node_class_physical_range(t_physical_tile_type_ptr physical_tile, const t_sub_tile* sub_tile, @@ -358,15 +327,11 @@ std::vector get_tile_root_classes(t_physical_tile_type_ptr physical_type); /** * Get the number of all classes, on the tile and inside the cluster. - * @param physical_type - * @return */ t_class_range get_flat_tile_primitive_classes(t_physical_tile_type_ptr physical_type); /** **/ int get_tile_class_max_ptc(t_physical_tile_type_ptr tile, bool is_flat); -/* */ - /* Access information related to pins */ /** get information given pin physical number **/ @@ -434,8 +399,6 @@ int get_edge_sw_arch_idx(t_physical_tile_type_ptr physical_tile, const t_pb_graph_node* get_pb_graph_node_from_pin_physical_num(t_physical_tile_type_ptr physical_type, int pin_physical_num); -int get_total_num_sub_tile_internal_pins(const t_sub_tile* sub_tile); - int get_tile_pin_max_ptc(t_physical_tile_type_ptr tile, bool is_flat); int get_tile_num_internal_pin(t_physical_tile_type_ptr tile); @@ -459,11 +422,6 @@ float get_pin_primitive_comb_delay(t_physical_tile_type_ptr physical_type, /** * @brief This function is used during reachability analysis to check whether two classes should be put in the same group - * @param physical_tile - * @param first_class_ptc_num - * @param second_class_ptc_num - * @param is_flat - * @return */ bool classes_in_same_block(t_physical_tile_type_ptr physical_tile, int first_class_ptc_num, @@ -473,15 +431,8 @@ bool classes_in_same_block(t_physical_tile_type_ptr physical_tile, /** * @brief Given the sink group, identify the pins which can reach both sink_ptc_num and at least one of the sinks, * in the grp. 
- * @param physical_tile - * @param sink_ptc_num - * @param grp * @return Key is the pin number and value is the number of sinks, including sink_ptc_num, in the grp reachable by the pin */ std::map get_sink_choking_points(t_physical_tile_type_ptr physical_tile, int sink_ptc_num, const std::vector& grp); - -/* */ - -#endif diff --git a/libs/libarchfpga/src/read_xml_arch_file.cpp b/libs/libarchfpga/src/read_xml_arch_file.cpp index 3950eb1b15b..46cde415630 100644 --- a/libs/libarchfpga/src/read_xml_arch_file.cpp +++ b/libs/libarchfpga/src/read_xml_arch_file.cpp @@ -774,7 +774,7 @@ static std::pair ProcessPinString(pugi::xml_node Locations, "No port name is present: %s\n", pin_loc_string); } - auto port = get_port_by_name(type, token.data); + auto port = type->get_port(token.data); if (port == nullptr) { archfpga_throw(loc_data.filename_c_str(), loc_data.line(Locations), "Port %s for %s could not be found: %s\n", diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index debd89c8bd6..61b4bb644a3 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -9,13 +9,10 @@ // Tool can either perform one route between a source (--source_rr_node) and // a sink (--sink_rr_node), or profile a source to all tiles (set // --source_rr_node and "--profile_source true"). 
-#include -#include -#include + #include #include "vtr_error.h" -#include "vtr_memory.h" #include "vtr_log.h" #include "vtr_time.h" @@ -28,16 +25,13 @@ #include "globals.h" #include "net_delay.h" -#include "RoutingDelayCalculator.h" #include "place_and_route.h" #include "router_delay_profiling.h" #include "route_tree.h" #include "route_common.h" #include "route_net.h" -#include "route_export.h" #include "rr_graph.h" -#include "rr_graph2.h" -#include "timing_place_lookup.h" +#include "compute_delta_delays_utils.h" struct t_route_util_options { /* Router diag tool Options */ @@ -238,36 +232,6 @@ static void profile_source(const Netlist<>& net_list, VTR_LOG("\n"); } -static t_chan_width setup_chan_width(t_router_opts router_opts, - t_chan_width_dist chan_width_dist) { - /*we give plenty of tracks, this increases routability for the */ - /*lookup table generation */ - - t_graph_type graph_directionality; - int width_fac; - - if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) { - auto& device_ctx = g_vpr_ctx.device(); - - auto type = find_most_common_tile_type(device_ctx.grid); - - width_fac = 4 * type->num_pins; - /*this is 2x the value that binary search starts */ - /*this should be enough to allow most pins to */ - /*connect to tracks in the architecture */ - } else { - width_fac = router_opts.fixed_channel_width; - } - - if (router_opts.route_type == GLOBAL) { - graph_directionality = GRAPH_BIDIR; - } else { - graph_directionality = GRAPH_UNIDIR; - } - - return init_chan(width_fac, chan_width_dist, graph_directionality); -} - t_route_util_options read_route_util_options(int argc, const char** argv) { //Explicitly initialize for zero initialization t_route_util_options args = t_route_util_options(); @@ -323,17 +287,15 @@ int main(int argc, const char **argv) { const Netlist<>& net_list = is_flat ? 
(const Netlist<>&)g_vpr_ctx.atom().nlist : (const Netlist<>&)g_vpr_ctx.clustering().clb_nlist; - t_chan_width chan_width = setup_chan_width( - vpr_setup.RouterOpts, - Arch.Chans); + t_chan_width chan_width = setup_chan_width(vpr_setup.RouterOpts, + Arch.Chans); - alloc_routing_structs( - chan_width, - vpr_setup.RouterOpts, - &vpr_setup.RoutingArch, - vpr_setup.Segments, - Arch.directs, - is_flat); + alloc_routing_structs(chan_width, + vpr_setup.RouterOpts, + &vpr_setup.RoutingArch, + vpr_setup.Segments, + Arch.directs, + is_flat); if(route_options.profile_source) { profile_source(net_list, diff --git a/vpr/src/base/place_and_route.cpp b/vpr/src/base/place_and_route.cpp index ba7e20ccd80..7074d34662a 100644 --- a/vpr/src/base/place_and_route.cpp +++ b/vpr/src/base/place_and_route.cpp @@ -1,14 +1,9 @@ -#include #include -#include -#include #include #include #include -#include "vtr_util.h" -#include "vtr_memory.h" #include "vtr_assert.h" #include "vtr_log.h" @@ -16,7 +11,6 @@ #include "vpr_utils.h" #include "vpr_error.h" #include "globals.h" -#include "atom_netlist.h" #include "place_and_route.h" #include "place.h" #include "read_place.h" @@ -24,21 +18,11 @@ #include "route.h" #include "route_export.h" #include "draw.h" -#include "stats.h" -#include "check_route.h" #include "rr_graph.h" -#include "net_delay.h" -#include "timing_place.h" #include "read_xml_arch_file.h" -#include "echo_files.h" #include "route_common.h" -#include "place_macro.h" -#include "power.h" -#include "place_util.h" #include "RoutingDelayCalculator.h" -#include "timing_info.h" -#include "tatum/echo_writer.hpp" /******************* Subroutines local to this module ************************/ @@ -415,6 +399,36 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list, return (final); } +t_chan_width setup_chan_width(const t_router_opts& router_opts, + t_chan_width_dist chan_width_dist) { + /*we give plenty of tracks, this increases routability for the */ + /*lookup table generation */ 
+ + t_graph_type graph_directionality; + int width_fac; + + if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) { + auto& device_ctx = g_vpr_ctx.device(); + + auto type = find_most_common_tile_type(device_ctx.grid); + + width_fac = 4 * type->num_pins; + /*this is 2x the value that binary search starts */ + /*this should be enough to allow most pins to */ + /*connect to tracks in the architecture */ + } else { + width_fac = router_opts.fixed_channel_width; + } + + if (router_opts.route_type == GLOBAL) { + graph_directionality = GRAPH_BIDIR; + } else { + graph_directionality = GRAPH_UNIDIR; + } + + return init_chan(width_fac, chan_width_dist, graph_directionality); +} + /** * @brief Assigns widths to channels (in tracks). * diff --git a/vpr/src/base/place_and_route.h b/vpr/src/base/place_and_route.h index 6f191c0ff9e..538996548f2 100644 --- a/vpr/src/base/place_and_route.h +++ b/vpr/src/base/place_and_route.h @@ -2,11 +2,9 @@ #define VPR_PLACE_AND_ROUTE_H #define INFINITE -1 -#define NOT_FOUND 0 #define WNEED 1 #define WL 2 -#define PROC_TIME 3 #include "vpr_types.h" #include "timing_info.h" @@ -18,7 +16,6 @@ struct t_fmap_cell { int fc; ///& placement_net_list, const std::shared_ptr& delay_calc, bool is_flat); +t_chan_width setup_chan_width(const t_router_opts& router_opts, + t_chan_width_dist chan_width_dist); + t_chan_width init_chan(int cfactor, const t_chan_width_dist& chan_width_dist, t_graph_type graph_directionality); diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 78124dd85c3..145601ac66f 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -2295,7 +2295,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio .show_in(argparse::ShowIn::HELP_ONLY); place_timing_grp.add_argument(args.post_place_timing_report_file, "--post_place_timing_report") - .help("Name of the post-placement timing report file (not generated if unspecfied)") + .help("Name of the 
post-placement timing report file (not generated if unspecified)") .default_value("") .show_in(argparse::ShowIn::HELP_ONLY); diff --git a/vpr/src/base/read_route.cpp b/vpr/src/base/read_route.cpp index d2d3bc14d54..6ac9d099c4b 100644 --- a/vpr/src/base/read_route.cpp +++ b/vpr/src/base/read_route.cpp @@ -39,12 +39,12 @@ #include "vpr_utils.h" #include "vpr_error.h" #include "place_and_route.h" -#include "timing_place.h" #include "route_export.h" #include "echo_files.h" #include "route_common.h" #include "route_tree.h" #include "read_route.h" +#include "d_ary_heap.h" #include "old_traceback.h" @@ -212,7 +212,6 @@ static void process_nets(const Netlist<>& net_list, std::ifstream& fp, ClusterNe process_nodes(net_list, fp, inet, filename, lineno); } input_tokens.clear(); - return; } static void process_nodes(const Netlist<>& net_list, std::ifstream& fp, ClusterNetId inet, const char* filename, int& lineno) { diff --git a/vpr/src/noc/noc_routing_algorithm_creator.h b/vpr/src/noc/noc_routing_algorithm_creator.h index 8cb9b777949..4c33d13f590 100644 --- a/vpr/src/noc/noc_routing_algorithm_creator.h +++ b/vpr/src/noc/noc_routing_algorithm_creator.h @@ -8,9 +8,10 @@ * * Overview * ======== - * There are a number of different available NoC routing algorithms. This class is a factory object for the NocRouting abstract class. This class constructs - * the appropriate routing algorithm based on the user specification in the - * command line. The user identifies a + * There are a number of different available NoC routing algorithms. + * This class is a factory object for the NocRouting abstract class. + * This class constructs the appropriate routing algorithm based on + * the user specification in the command line. The user identifies a * specific routing algorithm in the command line by providing a string * (which is the name of routing algorithm). 
* Then the corresponding routing algorithm is created here based on the diff --git a/vpr/src/place/analytic_placer.h b/vpr/src/place/analytic_placer.h index b73b3486f57..b279b82e058 100644 --- a/vpr/src/place/analytic_placer.h +++ b/vpr/src/place/analytic_placer.h @@ -83,7 +83,6 @@ */ # include "vpr_context.h" -# include "timing_place.h" # include "PlacementDelayCalculator.h" /* diff --git a/vpr/src/place/annealer.cpp b/vpr/src/place/annealer.cpp index b18f60b27bd..e6e0ffc85dd 100644 --- a/vpr/src/place/annealer.cpp +++ b/vpr/src/place/annealer.cpp @@ -16,6 +16,8 @@ #include "read_place.h" #include "placer_breakpoint.h" #include "RL_agent_util.h" +#include "PlacerSetupSlacks.h" +#include "PlacerCriticalities.h" /**************************************************************************/ /*************** Static Function Declarations *****************************/ @@ -488,7 +490,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator, criticalities_->disable_update(); setup_slacks_->enable_update(); update_timing_classes(crit_params, timing_info_, criticalities_, - setup_slacks_, pin_timing_invalidator_, placer_state_); + setup_slacks_, pin_timing_invalidator_); /* Get the setup slack analysis cost */ //TODO: calculate a weighted average of the slack cost and wiring cost @@ -592,7 +594,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator, // Revert the timing update update_timing_classes(crit_params, timing_info_, criticalities_, - setup_slacks_, pin_timing_invalidator_, placer_state_); + setup_slacks_, pin_timing_invalidator_); VTR_ASSERT_SAFE_MSG( verify_connection_setup_slacks(setup_slacks_, placer_state_), diff --git a/vpr/src/place/annealer.h b/vpr/src/place/annealer.h index fd9b0dbd928..f788aea666d 100644 --- a/vpr/src/place/annealer.h +++ b/vpr/src/place/annealer.h @@ -17,6 +17,7 @@ enum class e_agent_state; class NocCostHandler; class NetPinTimingInvalidator; +class PlacerSetupSlacks; /** * These variables keep 
track of the number of swaps diff --git a/vpr/src/place/move_generators/centroid_move_generator.cpp b/vpr/src/place/move_generators/centroid_move_generator.cpp index 45ba9121719..767fbf2ce7e 100644 --- a/vpr/src/place/move_generators/centroid_move_generator.cpp +++ b/vpr/src/place/move_generators/centroid_move_generator.cpp @@ -44,6 +44,7 @@ e_create_move CentroidMoveGenerator::propose_move(t_pl_blocks_to_be_moved& block ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/false, + /*placer_criticalities=*/nullptr, /*net_from=*/nullptr, /*pin_from=*/nullptr, placer_state, diff --git a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp index 7a1d39ed308..7d36889c2f6 100644 --- a/vpr/src/place/move_generators/critical_uniform_move_generator.cpp +++ b/vpr/src/place/move_generators/critical_uniform_move_generator.cpp @@ -1,4 +1,6 @@ + #include "critical_uniform_move_generator.h" + #include "globals.h" #include "place_constraints.h" #include "placer_state.h" @@ -13,8 +15,8 @@ e_create_move CriticalUniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved t_propose_action& proposed_action, float rlim, const t_placer_opts& placer_opts, - const PlacerCriticalities* /*criticalities*/) { - auto& cluster_ctx = g_vpr_ctx.clustering(); + const PlacerCriticalities* criticalities) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); const auto& placer_state = placer_state_.get(); const auto& block_locs = placer_state.block_locs(); const auto& blk_loc_registry = placer_state.blk_loc_registry(); @@ -25,6 +27,7 @@ e_create_move CriticalUniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/true, + criticalities, &net_from, &pin_from, placer_state, diff --git 
a/vpr/src/place/move_generators/critical_uniform_move_generator.h b/vpr/src/place/move_generators/critical_uniform_move_generator.h index dd4e5391474..68358552668 100644 --- a/vpr/src/place/move_generators/critical_uniform_move_generator.h +++ b/vpr/src/place/move_generators/critical_uniform_move_generator.h @@ -1,7 +1,6 @@ #ifndef VPR_CRITICAL_UNIFORM_MOVE_GEN_H #define VPR_CRITICAL_UNIFORM_MOVE_GEN_H #include "move_generator.h" -#include "timing_place.h" /** * @file diff --git a/vpr/src/place/move_generators/feasible_region_move_generator.cpp b/vpr/src/place/move_generators/feasible_region_move_generator.cpp index 75210dafd43..1c719a7b0ff 100644 --- a/vpr/src/place/move_generators/feasible_region_move_generator.cpp +++ b/vpr/src/place/move_generators/feasible_region_move_generator.cpp @@ -30,6 +30,7 @@ e_create_move FeasibleRegionMoveGenerator::propose_move(t_pl_blocks_to_be_moved& ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/true, + criticalities, &net_from, &pin_from, placer_state, diff --git a/vpr/src/place/move_generators/feasible_region_move_generator.h b/vpr/src/place/move_generators/feasible_region_move_generator.h index 702f8bdd26c..75304a60fd6 100644 --- a/vpr/src/place/move_generators/feasible_region_move_generator.h +++ b/vpr/src/place/move_generators/feasible_region_move_generator.h @@ -1,10 +1,9 @@ #ifndef VPR_FEASIBLE_REGION_MOVE_GEN_H #define VPR_FEASIBLE_REGION_MOVE_GEN_H #include "move_generator.h" -#include "timing_place.h" /** - * @brief Feasible Reion (FR) move genrator + * @brief Feasible Region (FR) move generator * * This move was originally defined by Chen et al . 
in "Simultaneous timing-driven placement and duplication", FPGA 2005 * diff --git a/vpr/src/place/move_generators/median_move_generator.cpp b/vpr/src/place/move_generators/median_move_generator.cpp index 2e982ac6425..99c1b892e17 100644 --- a/vpr/src/place/move_generators/median_move_generator.cpp +++ b/vpr/src/place/move_generators/median_move_generator.cpp @@ -28,6 +28,7 @@ e_create_move MedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks_ ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/false, + /*placer_criticalities=*/nullptr, /*net_from=*/nullptr, /*pin_from=*/nullptr, placer_state, diff --git a/vpr/src/place/move_generators/move_generator.h b/vpr/src/place/move_generators/move_generator.h index e39493e16c6..5ca0b4ce1f5 100644 --- a/vpr/src/place/move_generators/move_generator.h +++ b/vpr/src/place/move_generators/move_generator.h @@ -3,7 +3,7 @@ #include "vpr_types.h" #include "move_utils.h" -#include "timing_place.h" +#include "PlacerCriticalities.h" #include diff --git a/vpr/src/place/move_generators/uniform_move_generator.cpp b/vpr/src/place/move_generators/uniform_move_generator.cpp index 6c6e283ba94..7190918aba3 100644 --- a/vpr/src/place/move_generators/uniform_move_generator.cpp +++ b/vpr/src/place/move_generators/uniform_move_generator.cpp @@ -24,6 +24,7 @@ e_create_move UniformMoveGenerator::propose_move(t_pl_blocks_to_be_moved& blocks ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/false, + /*placer_criticalities=*/nullptr, /*net_from=*/nullptr, /*pin_from=*/nullptr, placer_state, diff --git a/vpr/src/place/move_generators/weighted_median_move_generator.cpp b/vpr/src/place/move_generators/weighted_median_move_generator.cpp index b391509f5c3..de949d37a75 100644 --- a/vpr/src/place/move_generators/weighted_median_move_generator.cpp +++ 
b/vpr/src/place/move_generators/weighted_median_move_generator.cpp @@ -30,6 +30,7 @@ e_create_move WeightedMedianMoveGenerator::propose_move(t_pl_blocks_to_be_moved& ClusterBlockId b_from = propose_block_to_move(placer_opts, proposed_action.logical_blk_type_index, /*highly_crit_block=*/false, + /*placer_criticalities=*/nullptr, /*net_from=*/nullptr, /*pin_from=*/nullptr, placer_state, diff --git a/vpr/src/place/move_generators/weighted_median_move_generator.h b/vpr/src/place/move_generators/weighted_median_move_generator.h index a6041f13e87..7da4be46bf6 100644 --- a/vpr/src/place/move_generators/weighted_median_move_generator.h +++ b/vpr/src/place/move_generators/weighted_median_move_generator.h @@ -2,7 +2,6 @@ #define VPR_WEIGHTED_MEDIAN_MOVE_GEN_H #include "move_generator.h" -#include "timing_place.h" /** * @brief The weighted median move generator diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index b5efb699fc7..d44c3611eca 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -547,30 +547,24 @@ void enable_placer_debug(const t_placer_opts& placer_opts, ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts, int& logical_blk_type_index, bool highly_crit_block, + const PlacerCriticalities* placer_criticalities, ClusterNetId* net_from, int* pin_from, const PlacerState& placer_state, vtr::RngContainer& rng) { + const auto& cluster_ctx = g_vpr_ctx.clustering(); + ClusterBlockId b_from = ClusterBlockId::INVALID(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - if (logical_blk_type_index == -1) { //If the block type is unspecified, choose any random block to be swapped with another random block - if (highly_crit_block) { - b_from = pick_from_highly_critical_block(*net_from, *pin_from, placer_state, rng); - } else { - b_from = pick_from_block(rng); - } + if (highly_crit_block) { + b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, *placer_criticalities, 
rng); + } else { + b_from = pick_from_block(logical_blk_type_index, rng); + } - //if a movable block found, set the block type - if (b_from) { - logical_blk_type_index = cluster_ctx.clb_nlist.block_type(b_from)->index; - } - } else { //If the block type is specified, choose a random block with blk_type to be swapped with another random block - if (highly_crit_block) { - b_from = pick_from_highly_critical_block(*net_from, *pin_from, logical_blk_type_index, placer_state, rng); - } else { - b_from = pick_from_block(logical_blk_type_index, rng); - } + //if a movable block found, set the block type + if (b_from) { + logical_blk_type_index = cluster_ctx.clb_nlist.block_type(b_from)->index; } if constexpr (VTR_ENABLE_DEBUG_LOGGING_CONST_EXPR) { @@ -589,99 +583,50 @@ const std::vector& movable_blocks_per_type(const t_logical_block return place_ctx.movable_blocks_per_type[blk_type.index]; } -ClusterBlockId pick_from_block(vtr::RngContainer& rng) { - auto& place_ctx = g_vpr_ctx.placement(); - - // get the number of movable clustered blocks - const size_t n_movable_blocks = place_ctx.movable_blocks.size(); - - if (n_movable_blocks > 0) { - //Pick a movable block at random and return it - auto b_from = ClusterBlockId(rng.irand((int)n_movable_blocks - 1)); - return b_from; - } else { - //No movable blocks found - return ClusterBlockId::INVALID(); - } -} - ClusterBlockId pick_from_block(const int logical_blk_type_index, vtr::RngContainer& rng) { - auto& place_ctx = g_vpr_ctx.placement(); - - const auto& movable_blocks_of_type = place_ctx.movable_blocks_per_type[logical_blk_type_index]; - - if (movable_blocks_of_type.empty()) { - return ClusterBlockId::INVALID(); - } - - auto b_from = ClusterBlockId(movable_blocks_of_type[rng.irand((int)movable_blocks_of_type.size() - 1)]); - - return b_from; -} - -//Pick a random highly critical block to be swapped with another random block. 
-//If none is found return ClusterBlockId::INVALID() -ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from, - int& pin_from, - const PlacerState& placer_state, - vtr::RngContainer& rng) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& place_move_ctx = placer_state.move(); - auto& block_locs = placer_state.block_locs(); + const auto& place_ctx = g_vpr_ctx.placement(); - //Initialize critical net and pin to be invalid - net_from = ClusterNetId::INVALID(); - pin_from = -1; + // if logical block type is specified, pick the 'from' block from blocks of that type; + // otherwise, select it randomly from all blocks + const auto& movable_blocks = (logical_blk_type_index < 0 )? place_ctx.movable_blocks : place_ctx.movable_blocks_per_type[logical_blk_type_index]; - //check if any critical block is available - if (place_move_ctx.highly_crit_pins.empty()) { + if (movable_blocks.empty()) { return ClusterBlockId::INVALID(); } - //pick a random highly critical pin and find the nets driver block - std::pair crit_pin = place_move_ctx.highly_crit_pins[rng.irand(place_move_ctx.highly_crit_pins.size() - 1)]; - ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first); + ClusterBlockId b_from = movable_blocks[rng.irand((int)movable_blocks.size() - 1)]; - if (block_locs[b_from].is_fixed) { - return ClusterBlockId::INVALID(); //Block is fixed, cannot move - } - - net_from = crit_pin.first; - pin_from = crit_pin.second; return b_from; - - //Unreachable statement - return ClusterBlockId::INVALID(); } -//Pick a random highly critical block with a specified block type to be swapped with another random block. 
-//If none is found return ClusterBlockId::INVALID() ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from, int& pin_from, const int logical_blk_type_index, const PlacerState& placer_state, + const PlacerCriticalities& placer_criticalities, vtr::RngContainer& rng) { - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& place_move_ctx = placer_state.move(); - auto& block_locs = placer_state.block_locs(); + const auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& block_locs = placer_state.block_locs(); //Initialize critical net and pin to be invalid net_from = ClusterNetId::INVALID(); pin_from = -1; + const auto& highly_crit_pins = placer_criticalities.get_highly_critical_pins(); + //check if any critical block is available - if (place_move_ctx.highly_crit_pins.empty()) { + if (highly_crit_pins.empty()) { return ClusterBlockId::INVALID(); } //pick a random highly critical pin and find the nets driver block - std::pair crit_pin = place_move_ctx.highly_crit_pins[rng.irand(place_move_ctx.highly_crit_pins.size() - 1)]; + std::pair crit_pin = highly_crit_pins[rng.irand(highly_crit_pins.size() - 1)]; ClusterBlockId b_from = cluster_ctx.clb_nlist.net_driver_block(crit_pin.first); //Check if picked block type matches with the blk_type specified, and it is not fixed //blk_type from propose move doesn't account for the EMPTY type auto b_from_type = cluster_ctx.clb_nlist.block_type(b_from); - if (b_from_type->index == logical_blk_type_index) { + if (b_from_type->index == logical_blk_type_index || logical_blk_type_index < 0) { if (block_locs[b_from].is_fixed) { return ClusterBlockId::INVALID(); //Block is fixed, cannot move } @@ -692,7 +637,6 @@ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from, } //No critical block with 'blk_type' found - //Unreachable statement return ClusterBlockId::INVALID(); } @@ -707,7 +651,7 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, // //Note that the range limit (rlim) is applied in a logical 
sense (i.e. 'compressed' grid space consisting //of the same block types, and not the physical grid space). This means, for example, that columns of 'rare' - //blocks (e.g. DSPs/RAMs) which are physically far appart but logically adjacent will be swappable even + //blocks (e.g. DSPs/RAMs) which are physically far apart but logically adjacent will be swappable even //at an rlim fo 1. // //This ensures that such blocks don't get locked down too early during placement (as would be the diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h index de3d771e7ae..ea9a90cc18d 100644 --- a/vpr/src/place/move_utils.h +++ b/vpr/src/place/move_utils.h @@ -7,6 +7,7 @@ class PlacerState; class BlkLocRegistry; +class PlacerCriticalities; namespace vtr { class RngContainer; } @@ -171,6 +172,7 @@ bool is_legal_swap_to_location(ClusterBlockId blk, ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts, int& logical_blk_type_index, bool highly_crit_block, + const PlacerCriticalities* placer_criticalities, ClusterNetId* net_from, int* pin_from, const PlacerState& placer_state, @@ -183,43 +185,32 @@ ClusterBlockId propose_block_to_move(const t_placer_opts& placer_opts, */ const std::vector& movable_blocks_per_type(const t_logical_block_type& blk_type); -/** - * @brief Select a random block to be swapped with another block - * - * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found - */ -ClusterBlockId pick_from_block(vtr::RngContainer& rng); /** * @brief Find a block with a specific block type to be swapped with another block * - * @param logical_blk_type_index: the agent type of the moving block. + * @param logical_blk_type_index The logical type of the moving block. If a negative value is passed, + * the block is selected randomly from all movable blocks and not from a specific type. + * @param rng A random number generator used to select a random block. 
* * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found */ ClusterBlockId pick_from_block(int logical_blk_type_index, vtr::RngContainer& rng); /** - * @brief Select a random highly critical block to be swapped with another block - * - * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found - */ -ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from, - int& pin_from, - const PlacerState& placer_state, - vtr::RngContainer& rng); - -/** - * @brief Find a block with a specific block type to be swapped with another block + * @brief Find a highly critical block with a specific block type to be swapped with another block. * - * @param logical_blk_type_index: the agent type of the moving block. + * @param logical_blk_type_index The logical type of the moving block. If a negative value is passed, + * the driver block of the randomly chosen highly critical pin is accepted regardless of its type. + * @param rng A random number generator used to select a random highly critical block. * - * @return BlockId of the selected block, ClusterBlockId::INVALID() if no block with specified block type found + * @return BlockId of the selected block, ClusterBlockId::INVALID() if there is no highly critical pin, the chosen block has a different type, or it is fixed.
*/ ClusterBlockId pick_from_highly_critical_block(ClusterNetId& net_from, int& pin_from, int logical_blk_type_index, const PlacerState& placer_state, + const PlacerCriticalities& placer_criticalities, vtr::RngContainer& rng); bool find_to_loc_uniform(t_logical_block_type_ptr type, diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp index ac049995347..e2a8e902e31 100644 --- a/vpr/src/place/net_cost_handler.cpp +++ b/vpr/src/place/net_cost_handler.cpp @@ -34,6 +34,7 @@ #include "vtr_math.h" #include "vtr_ndmatrix.h" #include "vtr_ndoffsetmatrix.h" +#include "PlacerCriticalities.h" #include diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h index 2b8e59af88f..9fad2757681 100644 --- a/vpr/src/place/net_cost_handler.h +++ b/vpr/src/place/net_cost_handler.h @@ -7,7 +7,6 @@ #pragma once #include "place_delay_model.h" -#include "timing_place.h" #include "move_transactions.h" #include "place_util.h" #include "vtr_ndoffsetmatrix.h" @@ -15,6 +14,7 @@ #include class PlacerState; +class PlacerCriticalities; /** * @brief The method used to calculate placement cost diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 3506d00b801..69e4e1895a0 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -13,7 +13,7 @@ #include "read_xml_arch_file.h" #include "echo_files.h" #include "histogram.h" -#include "place_delay_model.h" +#include "PlacementDelayModelCreator.h" #include "move_utils.h" #include "buttons.h" @@ -65,14 +65,14 @@ void try_place(const Netlist<>& net_list, if (placer_opts.place_algorithm.is_timing_driven()) { /*do this before the initial placement to avoid messing up the initial placement */ - place_delay_model = alloc_lookups_and_delay_model(net_list, - chan_width_dist, - placer_opts, - router_opts, - det_routing_arch, - segment_inf, - directs, - is_flat); + place_delay_model = PlacementDelayModelCreator::create_delay_model(placer_opts, + router_opts, + net_list, + 
det_routing_arch, + segment_inf, + chan_width_dist, + directs, + is_flat); if (isEchoFileEnabled(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)) { place_delay_model->dump_echo(getEchoFileName(E_ECHO_PLACEMENT_DELTA_DELAY_MODEL)); diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index 60b009d85ae..a6e2858e577 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -1,7 +1,11 @@ + #include "place_checkpoint.h" + #include "noc_place_utils.h" #include "placer_state.h" #include "grid_block.h" +#include "PlacerCriticalities.h" +#include "PlacerSetupSlacks.h" float t_placement_checkpoint::get_cp_cpd() const { return cpd_; } diff --git a/vpr/src/place/place_delay_model.h b/vpr/src/place/place_delay_model.h deleted file mode 100644 index 0aa01385e6e..00000000000 --- a/vpr/src/place/place_delay_model.h +++ /dev/null @@ -1,260 +0,0 @@ -/** - * @file place_delay_model.h - * @brief This file contains all the class and function declarations related to - * the placer delay model. For implementations, see place_delay_model.cpp. - */ - -#pragma once -#include "vtr_ndmatrix.h" -#include "vtr_flat_map.h" -#include "vpr_types.h" -#include "router_delay_profiling.h" - -#ifndef __has_attribute -# define __has_attribute(x) 0 // Compatibility with non-clang compilers. -#endif - -#if defined(COMPILER_GCC) && defined(NDEBUG) -# define ALWAYS_INLINE inline __attribute__((__always_inline__)) -#elif defined(COMPILER_MSVC) && defined(NDEBUG) -# define ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) -# define ALWAYS_INLINE __attribute__((always_inline)) // clang -#else -# define ALWAYS_INLINE inline -#endif - -///@brief Forward declarations. -class PlaceDelayModel; -class PlacerState; - -///@brief Initialize the placer delay model. 
-std::unique_ptr alloc_lookups_and_delay_model(const Netlist<>& net_list, - t_chan_width_dist chan_width_dist, - const t_placer_opts& place_opts, - const t_router_opts& router_opts, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - const std::vector& directs, - bool is_flat); - -///@brief Returns the delay of one point to point connection. -float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, - const vtr::vector_map& block_locs, - ClusterNetId net_id, - int ipin); - -///@brief Recompute all point to point delays, updating `connection_delay` matrix. -void comp_td_connection_delays(const PlaceDelayModel* delay_model, - PlacerState& placer_state); - -///@brief Abstract interface to a placement delay model. -class PlaceDelayModel { - public: - virtual ~PlaceDelayModel() = default; - - ///@brief Computes place delay model. - virtual void compute( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) - = 0; - - /** - * @brief Returns the delay estimate between the specified block pins. - * - * Either compute or read methods must be invoked before invoking delay. - */ - virtual float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const = 0; - - ///@brief Dumps the delay model to an echo file. - virtual void dump_echo(std::string filename) const = 0; - - /** - * @brief Write place delay model to specified file. - * - * May be unimplemented, in which case method should throw an exception. - */ - virtual void write(const std::string& file) const = 0; - - /** - * @brief Read place delay model from specified file. - * - * May be unimplemented, in which case method should throw an exception. - */ - virtual void read(const std::string& file) = 0; -}; - -///@brief A simple delay model based on the distance (delta) between block locations. 
-class DeltaDelayModel : public PlaceDelayModel { - public: - DeltaDelayModel(float min_cross_layer_delay, - bool is_flat) - : cross_layer_delay_(min_cross_layer_delay) - , is_flat_(is_flat) {} - DeltaDelayModel(float min_cross_layer_delay, - vtr::NdMatrix delta_delays, - bool is_flat) - : delays_(std::move(delta_delays)) - , cross_layer_delay_(min_cross_layer_delay) - , is_flat_(is_flat) {} - - void compute( - RouterDelayProfiler& router, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) override; - float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override; - void dump_echo(std::string filepath) const override; - - void read(const std::string& file) override; - void write(const std::string& file) const override; - const vtr::NdMatrix& delays() const { - return delays_; - } - - private: - vtr::NdMatrix delays_; // [0..num_layers-1][0..max_dx][0..max_dy] - float cross_layer_delay_; - /** - * @brief Indicates whether the router is a two-stage or run-flat - */ - bool is_flat_; -}; - -class OverrideDelayModel : public PlaceDelayModel { - public: - OverrideDelayModel(float min_cross_layer_delay, - bool is_flat) - : cross_layer_delay_(min_cross_layer_delay) - , is_flat_(is_flat) {} - void compute( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) override; - // returns delay from the specified (x,y) to the specified (x,y) with both endpoints on layer_num and the - // specified from and to pins - float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override; - void dump_echo(std::string filepath) const override; - - void read(const std::string& file) override; - void write(const std::string& file) const override; - - public: //Mutators - void set_base_delay_model(std::unique_ptr base_delay_model); - const 
DeltaDelayModel* base_delay_model() const; - float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const; - void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay); - - private: - std::unique_ptr base_delay_model_; - /** - * @brief Minimum delay of cross-layer connections - */ - float cross_layer_delay_; - /** - * @brief Indicates whether the router is a two-stage or run-flat - */ - bool is_flat_; - - void compute_override_delay_model(RouterDelayProfiler& router, - const t_router_opts& router_opts); - - /** - * @brief Structure that allows delays to be queried from the delay model. - * - * Delay is calculated given the origin physical tile, the origin - * pin, the destination physical tile, and the destination pin. - * This structure encapsulates all these information. - * - * @param from_type, to_type - * Physical tile index (for easy array access) - * @param from_class, to_class - * The class that the pins belongs to. - * @param to_x, to_y - * The horizontal and vertical displacement - * between two physical tiles. - */ - struct t_override { - short from_type; - short to_type; - short from_class; - short to_class; - short delta_x; - short delta_y; - - /** - * @brief Comparison operator designed for performance. - * - * Operator< is important since t_override serves as the key into the - * map structure delay_overrides_. A default comparison operator would - * not be inlined by the compiler. - * - * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare - * is required for operator< to be inlined by compiler. Proper inlining of - * the function reduces place time by around 5%. 
- * - * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225 - */ - friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) { - const short* left = reinterpret_cast(&lhs); - const short* right = reinterpret_cast(&rhs); - constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short); - return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS); - } - }; - - /** - * @brief Map data structure that returns delay values according to - * specific delay model queries. - * - * Delay model queries are provided by the t_override structure, which - * encapsulates the information regarding the origin and the destination. - */ - vtr::flat_map2 delay_overrides_; - - /** - * operator< treats memory layout of t_override as an array of short. - * This requires all members of t_override are shorts and there is no - * padding between members of t_override. - */ - static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)"); - static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts"); - static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts"); - static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts"); - static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts"); - static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts"); - static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts"); -}; 
- -///@brief A simple delay model based on the information stored in router lookahead -/// This is in contrast to other placement delay models that get the cost of getting from one location to another by running the router -class SimpleDelayModel : public PlaceDelayModel { - public: - SimpleDelayModel() {} - - void compute( - RouterDelayProfiler& router, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) override; - float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override; - void dump_echo(std::string /*filepath*/) const override {} - - void read(const std::string& /*file*/) override {} - void write(const std::string& /*file*/) const override {} - - private: - /** - * @brief The matrix to store the minimum delay between different points on different layers. - * - *The matrix used to store delay information is a 5D matrix. This data structure stores the minimum delay for each tile type on each layer to other layers - *for each dx and dy. We decided to separate the delay for each physical type on each die to accommodate cases where the connectivity of a physical type differs - *on each layer. Additionally, instead of using d_layer, we distinguish between the destination layer to handle scenarios where connectivity between layers - *is not uniform. For example, if the number of inter-layer connections between layer 1 and 2 differs from the number of connections between layer 0 and 1. - *One might argue that this variability could also occur for dx and dy. However, we are operating under the assumption that the FPGA fabric architecture is regular. 
- */ - vtr::NdMatrix delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy] -}; diff --git a/vpr/src/place/placer.h b/vpr/src/place/placer.h index 99c00d7e8e5..3fb89fb20f3 100644 --- a/vpr/src/place/placer.h +++ b/vpr/src/place/placer.h @@ -20,13 +20,15 @@ #include #include -#include "timing_place.h" #include "place_checkpoint.h" #include "PlacementDelayCalculator.h" #include "placer_state.h" #include "noc_place_utils.h" #include "net_cost_handler.h" #include "placement_log_printer.h" +#include "PlacerSetupSlacks.h" +#include "PlacerCriticalities.h" +#include "NetPinTimingInvalidator.h" class PlacementAnnealer; namespace vtr{ diff --git a/vpr/src/place/placer_state.h b/vpr/src/place/placer_state.h index 8f3b966a56d..a6896a359e8 100644 --- a/vpr/src/place/placer_state.h +++ b/vpr/src/place/placer_state.h @@ -12,7 +12,7 @@ #include "vpr_context.h" #include "vpr_net_pins_matrix.h" #include "vpr_types.h" -#include "timing_place.h" +#include "PlacerTimingCosts.h" /** * @brief State relating to the timing driven data. 
@@ -145,9 +145,6 @@ struct PlacerMoveContext : public Context { std::vector X_coord; std::vector Y_coord; std::vector layer_coord; - - // Container to save the highly critical pins (higher than a timing criticality limit set by commandline option) - std::vector> highly_crit_pins; }; diff --git a/vpr/src/place/timing/PlacerCriticalities.cpp b/vpr/src/place/timing/PlacerCriticalities.cpp new file mode 100644 index 00000000000..1f2e4f518e9 --- /dev/null +++ b/vpr/src/place/timing/PlacerCriticalities.cpp @@ -0,0 +1,127 @@ + +#include "PlacerCriticalities.h" + +#include "timing_info.h" +#include "timing_util.h" + +PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + std::shared_ptr timing_info) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_info_(std::move(timing_info)) + , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { +} + +/** + * @brief Updated the criticalities in the timing_place_crit_ data structure. + * + * If the criticalities are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() + * cannot accurately account for all the pins that need to be updated. In this case, + * `recompute_required` would be true, and we update all criticalities from scratch. + * + * If the criticality exponent has changed, we also need to update from scratch. + */ +void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_params) { + // If update is not enabled, exit the routine. 
+ if (!update_enabled) { + // re-computation is required on the next iteration + recompute_required = true; + return; + } + + // Determine what pins need updating + if (!recompute_required && crit_params.crit_exponent == last_crit_exponent_) { + incr_update_criticalities(); + } else { + recompute_criticalities(); + + // Record new criticality exponent + last_crit_exponent_ = crit_params.crit_exponent; + } + + /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. + * For every pin on every net (or, equivalently, for every tedge ending + * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ + + // Update the affected pins + for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_crit = calculate_clb_net_pin_criticality(*timing_info_, pin_lookup_, ParentPinId(size_t(clb_pin)), /*is_flat=*/false); + float new_crit = pow(clb_pin_crit, crit_params.crit_exponent); + + /* Update the highly critical pins container + * + * If the old criticality < limit and the new criticality > limit --> add this pin to the highly critical pins + * If the old criticality > limit and the new criticality < limit --> remove this pin from the highly critical pins + */ + if (!first_time_update_criticality) { + if (new_crit > crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] < crit_params.crit_limit) { + highly_crit_pins.emplace_back(clb_net, pin_index_in_net); + } else if (new_crit < crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] > crit_params.crit_limit) { + highly_crit_pins.erase(std::remove(highly_crit_pins.begin(), highly_crit_pins.end(), std::make_pair(clb_net, pin_index_in_net)), + highly_crit_pins.end()); + } + } else { + if (new_crit > crit_params.crit_limit) { + highly_crit_pins.emplace_back(clb_net, pin_index_in_net); + } + } + + /* The placer likes a great 
deal of contrast between criticalities. + * Since path criticality varies much more than timing, we "sharpen" timing + * criticality by taking it to some power, crit_exponent (between 1 and 8 by default). */ + timing_place_crit_[clb_net][pin_index_in_net] = new_crit; + } + + /* Criticalities updated. In sync with timing info. + * Can be incrementally updated on the next iteration */ + recompute_required = false; + + first_time_update_criticality = false; +} + +void PlacerCriticalities::set_recompute_required() { + recompute_required = true; +} + +void PlacerCriticalities::incr_update_criticalities() { + cluster_pins_with_modified_criticality_.clear(); + + for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_criticality()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + /* Some atom pins correspond to connections which are completely + * contained within a cluster, and hence have no corresponding + * clustered pin. */ + if (!clb_pin) continue; + + cluster_pins_with_modified_criticality_.insert(clb_pin); + } +} + +void PlacerCriticalities::recompute_criticalities() { + cluster_pins_with_modified_criticality_.clear(); + + // Non-incremental: all sink pins need updating + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_criticality_.insert(pin_id); + } + } +} + +///@brief Override the criticality of a particular connection. 
+void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float crit_val) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); + VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); + + timing_place_crit_[net_id][ipin] = crit_val; +} + +PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { + return vtr::make_range(cluster_pins_with_modified_criticality_); +} diff --git a/vpr/src/place/timing/PlacerCriticalities.h b/vpr/src/place/timing/PlacerCriticalities.h new file mode 100644 index 00000000000..161423dba6a --- /dev/null +++ b/vpr/src/place/timing/PlacerCriticalities.h @@ -0,0 +1,199 @@ + +#pragma once + +#include "vtr_vec_id_set.h" +#include "timing_info_fwd.h" +#include "clustered_netlist_utils.h" +#include "place_delay_model.h" +#include "vpr_net_pins_matrix.h" + +/** + * @brief Saves the placement criticality parameters + * + * crit_exponent: The criticality exponent used to sharpen the criticalities + * crit_limit: The limit to consider a pin as timing critical + */ +struct PlaceCritParams { + float crit_exponent; + float crit_limit; +}; + +/** + * @brief PlacerCriticalities returns the clustered netlist connection criticalities + * used by the placer ('sharpened' by a criticality exponent). + * + * Usage + * ===== + * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) + * to the clustered netlist (i.e. ClusterPinIds) used during placement. + * + * Criticalities are updated by update_criticalities(), given that `update_enabled` is + * set to true. It will update criticalities based on the atom netlist connection + * criticalities provided by the passed in SetupTimingInfo. + * + * This process can be done incrementally, based on the modified connections/AtomPinIds + * returned by SetupTimingInfo. 
However, the set returned only reflects the connections + * changed by the last call to the timing info update. + * + * Therefore, if SetupTimingInfo is updated twice in succession without criticalities + * getting updated (update_enabled = false), the returned set cannot account for all + * the connections that have been modified. In this case, we flag `recompute_required` + * as true, and we recompute the criticalities for every connection to ensure that + * they are all up to date. Hence, each time update_setup_slacks_and_criticalities() + * is called, we assign `recompute_required` the opposite value of `update_enabled`. + * + * This class also maps/transforms the modified atom connections/pins returned by the + * timing info into modified clustered netlist connections/pins after calling + * update_criticalities(). The interface then enables users to iterate over this range + * via pins_with_modified_criticality(). This is useful for incrementally re-calculating + * the timing costs. + * + * The criticalities of individual connections can then be queried by calling the + * criticality() member function. + * + * Implementation + * ============== + * To support incremental re-calculation, the class saves the last criticality exponent + * passed to PlacerCriticalities::update_criticalities(). If the next update uses the same + * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated + * from scratch, since a change in exponent changes *all* criticalities. + * + * Calculating criticalities: + * All the raw setup slack values across a single clock domain are gathered + * and rated from the best to the worst in terms of criticalities. In order + * to calculate criticalities, all the slack values need to be non-negative. + * Hence, if the worst slack is negative, all the slack values are shifted + * by the value of the worst slack so that the value is at least 0. If the + * worst slack is positive, then no shift happens. 
+ * + * The best (shifted) slack (the most positive one) will have a criticality of 0. + * The worst (shifted) slack value will have a criticality of 1. + * + * Criticalities are used to calculated timing costs for each connection. + * The formula is cost = delay * criticality. + * + * For a more detailed description on how criticalities are calculated, see + * calc_relaxed_criticality() in `timing_util.cpp`. + */ +class PlacerCriticalities { + public: //Types + typedef vtr::vec_id_set::iterator pin_iterator; + typedef vtr::vec_id_set::iterator net_iterator; + + typedef vtr::Range pin_range; + typedef vtr::Range net_range; + + public: //Lifetime + + ///@brief Allocates space for the timing_place_crit_ data structure. + PlacerCriticalities(const ClusteredNetlist& clb_nlist, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + std::shared_ptr timing_info); + + PlacerCriticalities(const PlacerCriticalities&) = delete; + PlacerCriticalities& operator=(const PlacerCriticalities&) = delete; + + public: //Accessors + ///@brief Returns the criticality of the specified connection. + float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } + + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which + * were modified by the last call to PlacerCriticalities::update_criticalities(). + */ + pin_range pins_with_modified_criticality() const; + + /// @brief Returns a constant reference to highly critical pins + const std::vector>& get_highly_critical_pins() const { return highly_crit_pins; } + + public: //Modifiers + /** + * @brief Updates criticalities based on the atom netlist criticalities + * provided by timing_info and the provided criticality_exponent. + * + * Should consistently call this method after the most recent timing analysis to + * keep the criticalities stored in this class in sync with the timing analyzer. 
+ * If out of sync, then the criticalities cannot be incrementally updated on + * during the next timing analysis iteration. + */ + void update_criticalities(const PlaceCritParams& crit_params); + + ///@brief Enable the recompute_required flag to enforce from scratch update. + void set_recompute_required(); + + /** + * @brief Collect all the sink pins in the netlist and prepare them for update. + * + * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). + */ + void recompute_criticalities(); + + ///@brief Override the criticality of a particular connection. + void set_criticality(ClusterNetId net, int ipin, float crit_val); + + ///@brief Set `update_enabled` to true. + void enable_update() { update_enabled = true; } + + ///@brief Set `update_enabled` to false. + void disable_update() { update_enabled = false; } + + private: //Data + ///@brief The clb netlist in the placement context. + const ClusteredNetlist& clb_nlist_; + + ///@brief The lookup table that maps atom pins to clb pins. + const ClusteredPinAtomPinsLookup& pin_lookup_; + + ///@brief A pointer to the setup timing analyzer + std::shared_ptr timing_info_; + + /** + * @brief The matrix that stores criticality value for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_crit_; + + /** + * The criticality exponent when update_criticalities() was last called + * (used to detect if incremental update can be used). + */ + float last_crit_exponent_ = std::numeric_limits::quiet_NaN(); + + ///@brief Set of pins with criticalities modified by last call to update_criticalities(). + vtr::vec_id_set cluster_pins_with_modified_criticality_; + + /** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to criticalities can be performed. 
+ * + * Note we use the set of pins reported by the *timing_info* as having modified + * criticality, rather than those marked as modified by the timing analyzer. + * + * Since timing_info uses shifted/relaxed criticality (which depends on max required + * time and worst case slacks), additional nodes may be modified when updating the + * atom pin criticalities. + */ + void incr_update_criticalities(); + + ///@brief Flag that turns on/off the update_criticalities() routine. + bool update_enabled = true; + + /** + * @brief Flag that checks if criticalities need to be recomputed for all connections. + * + * Used by the method update_criticalities(). The incremental update is not possible + * if this method wasn't called after the previous timing info update. + */ + bool recompute_required = true; + + /** + * @brief True until update_criticalities() has completed one enabled update. + * + * This can be used for incremental criticality update and also incrementally update the highly critical pins + */ + bool first_time_update_criticality = true; + + /// @brief Saves the highly critical pins (higher than a timing criticality limit set by commandline option) + std::vector> highly_crit_pins; +}; diff --git a/vpr/src/place/timing/PlacerSetupSlacks.cpp b/vpr/src/place/timing/PlacerSetupSlacks.cpp new file mode 100644 index 00000000000..3a097a582ff --- /dev/null +++ b/vpr/src/place/timing/PlacerSetupSlacks.cpp @@ -0,0 +1,92 @@ + +#include "PlacerSetupSlacks.h" + +#include "timing_util.h" +#include "timing_info.h" + + +PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + std::shared_ptr timing_info) + : clb_nlist_(clb_nlist) + , pin_lookup_(netlist_pin_lookup) + , timing_info_(std::move(timing_info)) + , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { +} + +/** + * @brief Updates the setup slacks in the timing_place_setup_slacks_ data structure. 
+ * + * If the setup slacks are not updated immediately after each time we call + * timing_info->update(), then timing_info->pins_with_modified_setup_slack() + * cannot accurately account for all the pins that need to be updated. + * + * In this case, `recompute_required` would be true, and we update all setup slacks + * from scratch. + */ +void PlacerSetupSlacks::update_setup_slacks() { + // If update is not enabled, exit the routine. + if (!update_enabled) { + // re-computation is required on the next iteration + recompute_required = true; + return; + } + + // Determine what pins need updating + if (!recompute_required) { + incr_update_setup_slacks(); + } else { + recompute_setup_slacks(); + } + + // Update the affected pins + for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { + ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); + int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); + + float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info_, pin_lookup_, clb_pin); + + timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; + } + + /* Setup slacks updated. In sync with timing info. + * Can be incrementally updated on the next iteration. */ + recompute_required = false; +} + +void PlacerSetupSlacks::incr_update_setup_slacks() { + cluster_pins_with_modified_setup_slack_.clear(); + + for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_slack()) { + ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); + + //Some atom pins correspond to connections which are completely + //contained within a cluster, and hence have no corresponding + //clustered pin. 
+ if (!clb_pin) continue; + + cluster_pins_with_modified_setup_slack_.insert(clb_pin); + } +} + +void PlacerSetupSlacks::recompute_setup_slacks() { + cluster_pins_with_modified_setup_slack_.clear(); + + // Non-incremental: all sink pins need updating + for (ClusterNetId net_id : clb_nlist_.nets()) { + for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { + cluster_pins_with_modified_setup_slack_.insert(pin_id); + } + } +} + +void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) { + VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); + VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); + + timing_place_setup_slacks_[net_id][ipin] = slack_val; +} + +PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { + return vtr::make_range(cluster_pins_with_modified_setup_slack_); +} diff --git a/vpr/src/place/timing/PlacerSetupSlacks.h b/vpr/src/place/timing/PlacerSetupSlacks.h new file mode 100644 index 00000000000..7ffc450e94b --- /dev/null +++ b/vpr/src/place/timing/PlacerSetupSlacks.h @@ -0,0 +1,120 @@ + +#pragma once + +#include "vtr_vec_id_set.h" +#include "timing_info_fwd.h" +#include "clustered_netlist_utils.h" +#include "place_delay_model.h" +#include "vpr_net_pins_matrix.h" + +/** + * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection. + * + * Usage + * ===== + * This class mirrors PlacerCriticalities by both its methods and its members. The only + * difference is that this class deals with RAW setup slacks returned by SetupTimingInfo + * rather than criticalities. See the documentation on PlacerCriticalities for more. + * + * RAW setup slacks are unlike criticalities. Their values are not confined between + * 0 and 1. Their values can be either positive or negative. 
+ * + * This class also provides iterating over the clustered netlist connections/pins that + * have modified setup slacks by the last call to update_setup_slacks(). However, this + * utility is mainly used for incrementally committing the setup slack values into the + * structure `connection_setup_slack` used by many placer routines. + */ +class PlacerSetupSlacks { + public: //Types + typedef vtr::vec_id_set::iterator pin_iterator; + typedef vtr::vec_id_set::iterator net_iterator; + + typedef vtr::Range pin_range; + typedef vtr::Range net_range; + + public: //Lifetime + ///@brief Allocates space for the timing_place_setup_slacks_ data structure. + PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + std::shared_ptr timing_info); + + PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; + PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; + + public: //Accessors + ///@brief Returns the setup slack of the specified connection. + float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } + + /** + * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) + * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). + */ + pin_range pins_with_modified_setup_slack() const; + + public: //Modifiers + /** + * @brief Updates setup slacks based on the atom netlist setup slacks provided + * by timing_info_. + * + * Should consistently call this method after the most recent timing analysis to + * keep the setup slacks stored in this class in sync with the timing analyzer. + * If out of sync, then the setup slacks cannot be incrementally updated + * during the next timing analysis iteration. + */ + void update_setup_slacks(); + + ///@brief Enable the recompute_required flag to enforce from scratch update. 
+ void set_recompute_required() { recompute_required = true; } + + ///@brief Override the setup slack of a particular connection. + void set_setup_slack(ClusterNetId net, int ipin, float slack_val); + + ///@brief Set `update_enabled` to true. + void enable_update() { update_enabled = true; } + + ///@brief Set `update_enabled` to false. + void disable_update() { update_enabled = false; } + + private: //Data + const ClusteredNetlist& clb_nlist_; + const ClusteredPinAtomPinsLookup& pin_lookup_; + std::shared_ptr timing_info_; + + /** + * @brief The matrix that stores raw setup slack values for each connection. + * + * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] + */ + ClbNetPinsMatrix timing_place_setup_slacks_; + + ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks() + vtr::vec_id_set cluster_pins_with_modified_setup_slack_; + + /** + * @brief Collect the cluster pins which need to be updated based on the latest timing + * analysis so that incremental updates to setup slacks can be performed. + * + * Note we use the set of pins reported by the *timing_info* as having modified + * setup slacks, rather than those marked as modified by the timing analyzer. + */ + void incr_update_setup_slacks(); + + /** + * @brief Collect all the sink pins in the netlist and prepare them for update. + * + * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). + */ + void recompute_setup_slacks(); + + ///@brief Flag that turns on/off the update_setup_slacks() routine. + bool update_enabled = true; + + /** + * @brief Flag that checks if setup slacks need to be recomputed for all connections. + * + * Used by the method update_setup_slacks(). The incremental update is not possible + * if this method wasn't called after the previous timing info update. 
+ */ + bool recompute_required = true; +}; + diff --git a/vpr/src/place/timing/PlacerTimingCosts.cpp b/vpr/src/place/timing/PlacerTimingCosts.cpp new file mode 100644 index 00000000000..d8ad6afafab --- /dev/null +++ b/vpr/src/place/timing/PlacerTimingCosts.cpp @@ -0,0 +1,126 @@ + +#include "PlacerTimingCosts.h" + +PlacerTimingCosts::PlacerTimingCosts(const ClusteredNetlist& nlist) { + auto nets = nlist.nets(); + + net_start_indices_.resize(nets.size()); + + // Walk through the netlist to determine how many connections there are. + size_t iconn = 0; + for (ClusterNetId net : nets) { + // The placer always skips 'ignored' nets, so they don't affect timing + // costs, so we also skip them here + if (nlist.net_is_ignored(net)) { + net_start_indices_[net] = OPEN; + continue; + } + + // Save the starting index of the current net's connections. + // We use a -1 offset, since sinks indexed from [1..num_net_pins-1] + // (there is no timing cost associated with net drivers) + net_start_indices_[net] = iconn - 1; + + // Reserve space for all this net's connections + iconn += nlist.net_sinks(net).size(); + } + + const size_t num_connections = iconn; + + // Determine how many binary tree levels we need to have a leaf for each connection cost + size_t ilevel = 0; + while (num_nodes_in_level(ilevel) < num_connections) { + ++ilevel; + } + num_levels_ = ilevel + 1; + + size_t num_leaves = num_nodes_in_level(ilevel); + size_t num_nodes_in_previous_level = num_nodes_in_level(ilevel - 1); + + VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); + VTR_ASSERT_MSG(num_connections == 0 || num_nodes_in_previous_level < num_connections, + "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); + + // We don't need to store all possible leaves if we have fewer connections (i.e. 
bottom-right of tree is empty) + size_t last_level_unused_nodes = num_nodes_in_level(ilevel) - num_connections; + size_t num_nodes = num_nodes_up_to_level(ilevel) - last_level_unused_nodes; + + // Reserve space for connection costs and intermediate node values + connection_costs_ = std::vector(num_nodes, std::numeric_limits::quiet_NaN()); + + // The net start indices we calculated earlier didn't account for intermediate binary tree nodes + // Shift the start indices after the intermediate nodes + size_t num_intermediate_nodes = num_nodes_up_to_level(ilevel - 1); + for (ClusterNetId net : nets) { + if (nlist.net_is_ignored(net)) continue; + + net_start_indices_[net] = net_start_indices_[net] + num_intermediate_nodes; + } +} + +double PlacerTimingCosts::total_cost_recurr(size_t inode) { + // Prune out-of-tree + if (inode > connection_costs_.size() - 1) { + return 0.; + } + + //Valid pre-calculated intermediate result or valid leaf + if (!std::isnan(connection_costs_[inode])) { + return connection_costs_[inode]; + } + + //Recompute recursively + double node_cost = total_cost_recurr(left_child(inode)) + + total_cost_recurr(right_child(inode)); + + //Save intermediate cost at this node + connection_costs_[inode] = node_cost; + + return node_cost; +} + +double PlacerTimingCosts::total_cost_from_scratch(size_t inode) const { + // Prune out-of-tree + if (inode > connection_costs_.size() - 1) { + return 0.; + } + + //Recompute recursively + double node_cost = total_cost_from_scratch(left_child(inode)) + + total_cost_from_scratch(right_child(inode)); + + return node_cost; +} + +void PlacerTimingCosts::invalidate(const double* invalidated_cost) { + //Check pointer within range of internal storage + VTR_ASSERT_SAFE_MSG( + invalidated_cost >= &connection_costs_[0], + "Connection cost pointer should be after start of internal storage"); + + VTR_ASSERT_SAFE_MSG( + invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], + "Connection cost pointer should be before end 
of internal storage"); + + size_t icost = invalidated_cost - &connection_costs_[0]; + + VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2)); + + //Invalidate parent intermediate costs up to root or first + //already-invalidated parent + size_t iparent = parent(icost); + + while (!std::isnan(connection_costs_[iparent])) { + //Invalidate + connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); + + if (iparent == 0) { + break; //At root + } else { + //Next parent + iparent = parent(iparent); + } + } + + VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root"); +} \ No newline at end of file diff --git a/vpr/src/place/timing/PlacerTimingCosts.h b/vpr/src/place/timing/PlacerTimingCosts.h new file mode 100644 index 00000000000..5e1415581c3 --- /dev/null +++ b/vpr/src/place/timing/PlacerTimingCosts.h @@ -0,0 +1,242 @@ + +#pragma once +#include "vtr_vec_id_set.h" +#include "timing_info_fwd.h" +#include "clustered_netlist_utils.h" +#include "place_delay_model.h" +#include "vpr_net_pins_matrix.h" + +/** + * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: + * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. + * + * It can be used similar to: + * + * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct + * + * //... + * + * //Modify a connection cost + * connection_timing_costs[net_id][ipin] = new_cost; + * + * //Potentially other modifications... + * + * //Calculate the updated timing cost, of all connections, + * //incrementally based on modifications + * float total_timing_cost = connection_timing_costs.total_cost(); + * + * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, + * and efficiently re-calculates the total timing cost incrementally based on the connections + * which have had their cost modified. 
+ * + * Implementation + * ============== + * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part + * of connection_costs_. To mimic 2d-array like access PlacerTimingCosts also uses two proxy + * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy + * respectively). + * + * The first part of connection_costs_ stores intermediate sums of the connection costs for + * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary + * tree, where leaves correspond to individual connection costs and intermediate nodes the + * partial sums of the connection costs. (The binary tree is stored implicitly in the + * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary + * tree we calculate the total timing cost over all connections. + * + * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset + * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up + * to the root) which have descendants (leaves) with modified connection costs. When the + * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. + * Only invalidated nodes are traversed, with valid nodes just returning their previously + * calculated (and unchanged) value. + * + * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can + * be done in O(k log K) time. + * + * It is important to note that due to limited floating point precision, floating point + * arithmetic has an order dependence (due to round-off). Using a binary tree to total + * the timing connection costs allows us to incrementally update the total timing cost while + * maintaining the *same order of operations* as if it was re-computed from scratch. This + * ensures we *always* get consistent results regardless of what/when connections are changed. 
+ * + * Proxy Classes + * ============= + * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of + * internal storage of that net's connection costs. + * + * ConnectionProxy is returned by NetProxy's operator[], and holds a reference to a particular + * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy + * supports assignment, allowing clients to modify the connection cost. It also detects if the + * assigned value differs from the previous value and if so, calls PlacerTimingCosts's + * invalidate() method on that connection cost. + * + * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) + * so they will be re-calculated by PlacerTimingCosts' total_cost() method. + */ +class PlacerTimingCosts { + public: + PlacerTimingCosts() = default; + + PlacerTimingCosts(const ClusteredNetlist& nlist); + + /** + * @brief Proxy class representing a connection cost. + * + * Supports modification of connection cost while detecting + * changes and reporting them up to PlacerTimingCosts. + */ + class ConnectionProxy { + public: + ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) + : timing_costs_(timing_costs) + , connection_cost_(connection_cost) {} + + ///@brief Allow clients to modify the connection cost via assignment. + ConnectionProxy& operator=(double new_cost) { + if (new_cost != connection_cost_) { + //If connection cost changed, update it, and mark it + //as invalidated + connection_cost_ = new_cost; + timing_costs_->invalidate(&connection_cost_); + } + return *this; + } + + /** + * @brief Support getting the current connection cost as a double. + * + * Useful for client code operating on the cost values (e.g. difference between costs). 
+ */ + operator double() const { + return connection_cost_; + } + + private: + PlacerTimingCosts* timing_costs_; + double& connection_cost_; + }; + + /** + * @brief Proxy class representing the connection costs of a net. + * + * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. + */ + class NetProxy { + public: + NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) + : timing_costs_(timing_costs) + , net_sink_costs_(net_sink_costs) {} + + ///@brief Indexes into the specific net pin/connection. + ConnectionProxy operator[](size_t ipin) { + return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); + } + + const ConnectionProxy operator[](size_t ipin) const { + return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); + } + + private: + PlacerTimingCosts* timing_costs_; + double* net_sink_costs_; + }; + + ///@brief Indexes into the specific net. + NetProxy operator[](ClusterNetId net_id) { + VTR_ASSERT_SAFE(net_start_indices_[net_id] >= 0); + + double* net_connection_costs = &connection_costs_[net_start_indices_[net_id]]; + return NetProxy(this, net_connection_costs); + } + + NetProxy operator[](ClusterNetId net_id) const { + VTR_ASSERT_SAFE(net_start_indices_[net_id] >= 0); + + const double* net_connection_costs = &connection_costs_[net_start_indices_[net_id]]; + return NetProxy(const_cast(this), const_cast(net_connection_costs)); + } + + void clear() { + connection_costs_.clear(); + net_start_indices_.clear(); + } + + void swap(PlacerTimingCosts& other) { + std::swap(connection_costs_, other.connection_costs_); + std::swap(net_start_indices_, other.net_start_indices_); + std::swap(num_levels_, other.num_levels_); + } + + /** + * @brief Calculates the total cost of all connections efficiently + * in the face of modified connection costs. 
+ */ + double total_cost() { + float cost = total_cost_recurr(0); //Root + + VTR_ASSERT_DEBUG_MSG(cost == total_cost_from_scratch(0), + "Expected incremental and from-scratch costs to be consistent"); + + return cost; + } + + private: + ///@brief Recursively calculate and update the timing cost rooted at inode. + double total_cost_recurr(size_t inode); + + double total_cost_from_scratch(size_t inode) const; + + ///@brief Friend-ed so it can call invalidate(). + friend ConnectionProxy; + + void invalidate(const double* invalidated_cost); + + static size_t left_child(size_t i) { + return 2 * i + 1; + } + + static size_t right_child(size_t i) { + return 2 * i + 2; + } + + static size_t parent(size_t i) { + return (i - 1) / 2; + } + + /** + * @brief Returns the number of nodes in ilevel'th level. + * + * If ilevel is negative, return 0, since the root shouldn't + * be counted as a leaf node candidate. + */ + static size_t num_nodes_in_level(int ilevel) { + return ilevel < 0 ? 0 : (2 << (ilevel)); + } + + ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). + static size_t num_nodes_up_to_level(int ilevel) { + return (2 << (ilevel + 1)) - 1; + } + + private: + /** + * @brief Vector storing the implicit binary tree of connection costs. + * + * The actual connections are stored at the end of the vector + * (last level of the binary tree). The earlier portions of + * the tree are the intermediate nodes. + * + * The methods left_child()/right_child()/parent() can be used + * to traverse the tree by indices into this vector. + */ + std::vector connection_costs_; + + /** + * @brief Vector storing the indices of the first connection + * for each net in the netlist, used for indexing by net. + */ + vtr::vector net_start_indices_; + + ///@brief Number of levels in the binary tree. 
+ size_t num_levels_ = 0; +}; diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp new file mode 100644 index 00000000000..3482cd091e0 --- /dev/null +++ b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.cpp @@ -0,0 +1,80 @@ + + +#include "PlacementDelayModelCreator.h" + +#include "place_delay_model.h" +#include "simple_delay_model.h" +#include "delta_delay_model.h" +#include "override_delay_model.h" + +#include "vtr_time.h" +#include "physical_types.h" +#include "place_and_route.h" + +static int get_longest_segment_length(std::vector& segment_inf) { + int length = 0; + + for (const t_segment_inf& seg_info : segment_inf) { + if (seg_info.length > length) { + length = seg_info.length; + } + } + + return length; +} + +std::unique_ptr +PlacementDelayModelCreator::create_delay_model(const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + const Netlist<>& net_list, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + t_chan_width_dist chan_width_dist, + const std::vector& directs, + bool is_flat) { + vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up"); + + t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist); + + alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat); + + const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch, + router_opts.lookahead_type, + router_opts.write_router_lookahead, + router_opts.read_router_lookahead, + segment_inf, + is_flat); + + RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat); + + int longest_length = get_longest_segment_length(segment_inf); + + // now setup and compute the actual arrays + std::unique_ptr place_delay_model; + float min_cross_layer_delay = get_min_cross_layer_delay(); + + if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) { + 
place_delay_model = std::make_unique(); + } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) { + place_delay_model = std::make_unique(min_cross_layer_delay, is_flat); + } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) { + place_delay_model = std::make_unique(min_cross_layer_delay, is_flat); + } else { + VTR_ASSERT_MSG(false, "Invalid placer delay model"); + } + + if (placer_opts.read_placement_delay_lookup.empty()) { + place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length); + } else { + place_delay_model->read(placer_opts.read_placement_delay_lookup); + } + + if (!placer_opts.write_placement_delay_lookup.empty()) { + place_delay_model->write(placer_opts.write_placement_delay_lookup); + } + + // free all data structures that are no longer needed + free_routing_structs(); + + return place_delay_model; +} \ No newline at end of file diff --git a/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h new file mode 100644 index 00000000000..c92b67d4854 --- /dev/null +++ b/vpr/src/place/timing/delay_model/PlacementDelayModelCreator.h @@ -0,0 +1,30 @@ + +#pragma once + +#include +#include + +#include "netlist.h" + +class PlaceDelayModel; +struct t_placer_opts; +struct t_router_opts; +struct t_det_routing_arch; +struct t_segment_inf; +struct t_chan_width_dist; +struct t_direct_inf; + +class PlacementDelayModelCreator { + public: + // nothing to do in the constructor + PlacementDelayModelCreator() = delete; + + static std::unique_ptr create_delay_model(const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + const Netlist<>& net_list, + t_det_routing_arch* det_routing_arch, + std::vector& segment_inf, + t_chan_width_dist chan_width_dist, + const std::vector& directs, + bool is_flat); +}; diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp 
b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp new file mode 100644 index 00000000000..725159406c0 --- /dev/null +++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.cpp @@ -0,0 +1,968 @@ + +#include "compute_delta_delays_utils.h" + +#include "vtr_time.h" +#include "vtr_math.h" +#include "physical_types.h" +#include "globals.h" +#include "router_delay_profiling.h" + +/// Indicates the delta delay value has not been calculated +static constexpr float UNINITIALIZED_DELTA = -1; +/// Indicates delta delay from/to an EMPTY block +static constexpr float EMPTY_DELTA = -2; +/// Indicates there is no valid delta delay +static constexpr float IMPOSSIBLE_DELTA = std::numeric_limits::infinity(); + +static vtr::NdMatrix compute_delta_delays(RouterDelayProfiler& route_profiler, + const t_placer_opts& palcer_opts, + const t_router_opts& router_opts, + bool measure_directconnect, + size_t longest_length, + bool is_flat); + +static void fix_empty_coordinates(vtr::NdMatrix& delta_delays); + +static void fill_impossible_coordinates(vtr::NdMatrix& delta_delays); + +static bool verify_delta_delays(const vtr::NdMatrix& delta_delays); + +static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler, + vtr::Matrix>& matrix, + int from_layer_num, + int to_layer_num, + int source_x, + int source_y, + int start_x, + int start_y, + int end_x, + int end_y, + const t_router_opts& router_opts, + bool measure_directconnect, + const std::set& allowed_types, + bool /*is_flat*/); + +static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& route_profiler, + vtr::Matrix>& matrix, + int from_layer_num, + int to_layer_num, + int source_x, + int source_y, + int start_x, + int start_y, + int end_x, + int end_y, + const t_router_opts& router_opts, + bool measure_directconnect, + const std::set& allowed_types, + bool is_flat); + +/** + * @brief Routes between a source and sink location to calculate the delay. 
+ * + * This function computes the delay of a routed connection between a source and sink node + * specified by their coordinates and layers. It iterates over the best driver and sink pin + * classes to find a valid routing path and calculates the delay if a path exists. + * + * @param route_profiler Reference to the `RouterDelayProfiler` responsible for calculating routing delays. + * @param source_x The x-coordinate of the source location. + * @param source_y The y-coordinate of the source location. + * @param source_layer The layer index of the source node. + * @param sink_x The x-coordinate of the sink location. + * @param sink_y The y-coordinate of the sink location. + * @param sink_layer The layer index of the sink node. + * @param router_opts Routing options used for delay calculation. + * @param measure_directconnect If `true`, includes direct connect delays; otherwise, skips direct connections. + * + * @return The calculated routing delay. If routing fails, it returns `IMPOSSIBLE_DELTA`. + */ +static float route_connection_delay(RouterDelayProfiler& route_profiler, + int source_x, + int source_y, + int source_layer, + int sink_x, + int sink_y, + int sink_layer, + const t_router_opts& router_opts, + bool measure_directconnect); + +/** + * @brief Computes a reduced value from a vector of delay values using the specified reduction method. + * + * @param delays A reference to a vector of delay values. This vector may be modified + * (e.g., sorted) depending on the reducer used. + * @param reducer The reduction method to be applied. + * + * @return The reduced delay value. If the input vector is empty, the function + * returns `IMPOSSIBLE_DELTA`. + * + * @throws VPR_FATAL_ERROR if the reducer is unrecognized. + */ +static float delay_reduce(std::vector& delays, e_reducer reducer); + +/** + * @brief Adds a delay value to a 2D matrix of delay vectors. + * + * Updates the delay vector at position (`delta_x`, `delta_y`) in the matrix. 
+ * If the element contains only `EMPTY_DELTA`, it is replaced with the new delay; + * otherwise, the delay is appended to the vector. + * + * @param matrix A 2D matrix of delay vectors. + * @param delta_x The x-index in the matrix. + * @param delta_y The y-index in the matrix. + * @param delay The delay value to add. + */ +static void add_delay_to_matrix(vtr::Matrix>& matrix, + int delta_x, + int delta_y, + float delay); + +/** + * @brief Computes the average delay for a routing span. + * + * This function calculates the average placement delay for a routing span starting from a + * given layer and spanning a region defined by delta x and delta y. It iteratively searches + * for valid delay values within an expanding neighborhood (starting from a distance of 1) + * around the specified delta offsets and layer, until valid values are found or + * the maximum search distance (`max_distance`) is reached. + * + * @param matrix A 4D matrix of delay values indexed by `[from_layer][to_layer][delta_x][delta_y]`. + * @param from_layer The starting layer index of the routing span. + * @param to_tile_loc A structure holding the delta offsets (`x` and `y`) and the target layer index (`layer_num`). + * @param max_distance The maximum neighborhood distance to search for valid delay values. + * + * @return The average of valid delay values within the search range. If no valid delays + * are found up to the maximum distance, the function returns `IMPOSSIBLE_DELTA`. + * + * @note The function performs a Manhattan-distance-based neighborhood search around the target location. 
+ */ +static float find_neighboring_average(vtr::NdMatrix& matrix, + int from_layer, + t_physical_tile_loc to_tile_loc, + int max_distance); + +/***************************************************************************************/ + +static vtr::NdMatrix compute_delta_delays(RouterDelayProfiler& route_profiler, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + bool measure_directconnect, + size_t longest_length, + bool is_flat) { + + + const auto& device_ctx = g_vpr_ctx.device(); + const auto& grid = device_ctx.grid; + + const size_t num_layers = grid.get_num_layers(); + const size_t device_width = grid.width(); + const size_t device_height = grid.height(); + + /* To avoid edge effects we place the source at least 'longest_length' away + * from the device edge and route from there for all possible delta values < dimension + */ + + // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // + | | + + // + A | B | C + + // + | | + + // +-----------------\-----------------------.---------------+ + // + | | + + // + | | + + // + | | + + // + | | + + // + D | E | F + + // + | | + + // + | | + + // + | | + + // + | | + + // +-----------------*-----------------------/---------------+ + // + | | + + // + G | H | I + + // + | | + + // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // + // * = (low_x, low_y) + // . = (high_x, high_y) + // / = (high_x, low_y) + // \ = (low_x, high_y) + // + = device edge + const size_t mid_x = vtr::nint(device_width / 2); + const size_t mid_y = vtr::nint(device_height / 2); + const size_t low_x = std::min(longest_length, mid_x); + const size_t low_y = std::min(longest_length, mid_y); + const size_t high_x = (longest_length <= device_width) ? std::max(device_width - longest_length, mid_x) : mid_x; + const size_t high_y = (longest_length <= device_height) ? 
std::max(device_width - longest_length, mid_y) : mid_y; + + vtr::NdMatrix delta_delays({num_layers, num_layers, device_width, device_height}); + + std::set allowed_types; + if (!placer_opts.allowed_tiles_for_delay_model.empty()) { + std::vector allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ","); + allowed_types = std::set(allowed_types_vector.begin(), allowed_types_vector.end()); + } + + for (int from_layer_num = 0; from_layer_num < (int)num_layers; from_layer_num++) { + for (int to_layer_num = 0; to_layer_num < (int)num_layers; to_layer_num++) { + vtr::NdMatrix, 2> sampled_delta_delays({device_width, device_height}); + + // Find the lowest y location on the left edge with a non-empty block + int y = 0; + int x = 0; + t_physical_tile_type_ptr src_type = nullptr; + for (x = 0; x < (int)device_width; ++x) { + for (y = 0; y < (int)device_height; ++y) { + t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num}); + + if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { + // check if the tile type is among the allowed types + if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) { + continue; + } + src_type = type; + break; + } + } + if (src_type != nullptr) { + break; + } + } + VTR_ASSERT(src_type != nullptr); + + auto generic_compute_matrix = (placer_opts.place_delta_delay_matrix_calculation_method == e_place_delta_delay_algorithm::ASTAR_ROUTE) ? 
generic_compute_matrix_iterative_astar : generic_compute_matrix_dijkstra_expansion; + +#ifdef VERBOSE + VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, to_layer_num, + x, y, + x, y, + device_width - 1, device_height - 1, + router_opts, + measure_directconnect, allowed_types, + is_flat); + + // Find the lowest x location on the bottom edge with a non-empty block + src_type = nullptr; + for (y = 0; y < (int)device_height; ++y) { + for (x = 0; x < (int)device_width; ++x) { + t_physical_tile_type_ptr type = grid.get_physical_type({x, y, from_layer_num}); + + if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { + // check if the tile type is among the allowed types + if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) { + continue; + } + src_type = type; + break; + } + } + if (src_type) { + break; + } + } + VTR_ASSERT(src_type != nullptr); +#ifdef VERBOSE + VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, to_layer_num, + x, y, + x, y, + device_width - 1, device_height - 1, + router_opts, + measure_directconnect, allowed_types, + is_flat); + + //Since the other delta delay values may have suffered from edge effects, + //we recalculate deltas within regions B, C, E, F +#ifdef VERBOSE + VTR_LOG("Computing from low/low:\n"); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, to_layer_num, + low_x, low_y, + low_x, low_y, + device_width - 1, device_height - 1, + router_opts, + measure_directconnect, allowed_types, + is_flat); + + //Since the other delta delay values may have suffered from edge effects, + //we recalculate deltas within regions D, E, G, H +#ifdef VERBOSE + VTR_LOG("Computing from high/high:\n"); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, 
to_layer_num, + high_x, high_y, + 0, 0, + high_x, high_y, + router_opts, + measure_directconnect, allowed_types, + is_flat); + + //Since the other delta delay values may have suffered from edge effects, + //we recalculate deltas within regions A, B, D, E +#ifdef VERBOSE + VTR_LOG("Computing from high/low:\n"); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, to_layer_num, + high_x, low_y, + 0, low_y, + high_x, device_height - 1, + router_opts, + measure_directconnect, allowed_types, + is_flat); + + //Since the other delta delay values may have suffered from edge effects, + //we recalculate deltas within regions E, F, H, I +#ifdef VERBOSE + VTR_LOG("Computing from low/high:\n"); +#endif + generic_compute_matrix(route_profiler, sampled_delta_delays, + from_layer_num, to_layer_num, + low_x, high_y, + low_x, 0, + device_width - 1, high_y, + router_opts, + measure_directconnect, allowed_types, + is_flat); + for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) { + for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) { + delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer); + } + } + } + } + + return delta_delays; +} + +static void fix_empty_coordinates(vtr::NdMatrix& delta_delays) { + // Set any empty delta's to the average of its neighbours + // + // Empty coordinates may occur if the sampling location happens to not have + // a connection at that location. However, a more thorough sampling likely + // would return a result, so we fill in the empty holes with a small + // neighbour average. 
+ constexpr int kMaxAverageDistance = 2; + for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) { + for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) { + for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) { + for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) { + if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) { + delta_delays[from_layer][to_layer][delta_x][delta_y] = + find_neighboring_average(delta_delays, + from_layer, + {delta_x, delta_y, to_layer}, + kMaxAverageDistance); + } + } + } + } + } +} + +static void fill_impossible_coordinates(vtr::NdMatrix& delta_delays) { + // Set any impossible delta's to the average of its neighbours + // + // Impossible coordinates may occur if an IPIN cannot be reached from the + // sampling OPIN. This might occur if the IPIN or OPIN used for sampling + // is specialized, and therefore cannot be reached via the by the pins + // sampled. Leaving this value in the delay matrix will result in invalid + // slacks if the delay matrix uses this value. + // + // A max average distance of 5 is used to provide increased effort in + // filling these gaps. It is more important to have a poor predication, + // than an invalid value and causing a slack assertion. 
+ constexpr int kMaxAverageDistance = 5; + for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) { + for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) { + for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) { + for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) { + if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) { + delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average( + delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance); + } + } + } + } + } +} + +static bool verify_delta_delays(const vtr::NdMatrix& delta_delays) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& grid = device_ctx.grid; + + for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) { + for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) { + for (size_t x = 0; x < grid.width(); ++x) { + for (size_t y = 0; y < grid.height(); ++y) { + float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y]; + + if (delta_delay < 0.) 
{ + VPR_ERROR(VPR_ERROR_PLACE, + "Found invalid negative delay %g for delta [%d,%d,%d,%d]", + delta_delay, from_layer_num, to_layer_num, x, y); + } + } + } + } + } + + return true; +} + +static void generic_compute_matrix_iterative_astar(RouterDelayProfiler& route_profiler, + vtr::Matrix>& matrix, + int from_layer_num, + int to_layer_num, + int source_x, + int source_y, + int start_x, + int start_y, + int end_x, + int end_y, + const t_router_opts& router_opts, + bool measure_directconnect, + const std::set& allowed_types, + bool /*is_flat*/) { + const auto& device_ctx = g_vpr_ctx.device(); + + for (int sink_x = start_x; sink_x <= end_x; sink_x++) { + for (int sink_y = start_y; sink_y <= end_y; sink_y++) { + const int delta_x = abs(sink_x - source_x); + const int delta_y = abs(sink_y - source_y); + + t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}); + t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}); + + bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE + || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE); + + bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end(); + + if (src_or_target_empty || !is_allowed_type) { + if (matrix[delta_x][delta_y].empty()) { + // Only set empty target if we don't already have a valid delta delay + matrix[delta_x][delta_y].push_back(EMPTY_DELTA); +#ifdef VERBOSE + VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", + "EMPTY", + delta_x, delta_y, + source_x, source_y, + sink_x, sink_y); +#endif + } + } else { + // Valid start/end + float delay = route_connection_delay(route_profiler, + source_x, + source_y, + from_layer_num, + sink_x, + sink_y, + to_layer_num, + router_opts, + measure_directconnect); + +#ifdef VERBOSE + VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n", + delay, + delta_x, delta_y, + 
source_x, source_y, + sink_x, sink_y); +#endif + if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) { + // Overwrite empty delta + matrix[delta_x][delta_y][0] = delay; + } else { + // Collect delta + matrix[delta_x][delta_y].push_back(delay); + } + } + } + } +} + +static void generic_compute_matrix_dijkstra_expansion(RouterDelayProfiler& /*route_profiler*/, + vtr::Matrix>& matrix, + int from_layer_num, + int to_layer_num, + int source_x, + int source_y, + int start_x, + int start_y, + int end_x, + int end_y, + const t_router_opts& router_opts, + bool measure_directconnect, + const std::set& allowed_types, + bool is_flat) { + const auto& device_ctx = g_vpr_ctx.device(); + + t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}); + bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end(); + if (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) { + for (int sink_x = start_x; sink_x <= end_x; sink_x++) { + for (int sink_y = start_y; sink_y <= end_y; sink_y++) { + int delta_x = abs(sink_x - source_x); + int delta_y = abs(sink_y - source_y); + + if (matrix[delta_x][delta_y].empty()) { + //Only set empty target if we don't already have a valid delta delay + matrix[delta_x][delta_y].push_back(EMPTY_DELTA); +#ifdef VERBOSE + VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", + "EMPTY", + delta_x, delta_y, + source_x, source_y, + sink_x, sink_y); +#endif + } + } + } + + return; + } + + vtr::Matrix found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false); + + auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num})); + for (int driver_ptc : best_driver_ptcs) { + VTR_ASSERT(driver_ptc != OPEN); + RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc); + + 
VTR_ASSERT(source_rr_node != RRNodeId::INVALID()); + auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat); + + bool path_to_all_sinks = true; + for (int sink_x = start_x; sink_x <= end_x; sink_x++) { + for (int sink_y = start_y; sink_y <= end_y; sink_y++) { + int delta_x = abs(sink_x - source_x); + int delta_y = abs(sink_y - source_y); + + if (found_matrix[delta_x][delta_y]) { + continue; + } + + t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}); + if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { + if (matrix[delta_x][delta_y].empty()) { + // Only set empty target if we don't already have a valid delta delay + matrix[delta_x][delta_y].push_back(EMPTY_DELTA); +#ifdef VERBOSE + VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", + "EMPTY", + delta_x, delta_y, + source_x, source_y, + sink_x, sink_y); +#endif + found_matrix[delta_x][delta_y] = true; + } + } else { + bool found_a_sink = false; + auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num})); + for (int sink_ptc : best_sink_ptcs) { + VTR_ASSERT(sink_ptc != OPEN); + RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc); + + if (sink_rr_node == RRNodeId::INVALID()) + continue; + + if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) { + // Skip if we shouldn't measure direct connects and a direct connect exists + continue; + } + + if (std::isnan(delays[sink_rr_node])) { + // This sink was not found + continue; + } + +#ifdef VERBOSE + VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n", + delays[size_t(sink_rr_node)], + delta_x, delta_y, + source_x, source_y, + sink_x, sink_y); +#endif + found_matrix[delta_x][delta_y] = true; + + add_delay_to_matrix(matrix, delta_x, delta_y, delays[sink_rr_node]); + + found_a_sink = true; + break; + 
} + + if (!found_a_sink) { + path_to_all_sinks = false; + } + } + } + } + + if (path_to_all_sinks) { + break; + } + } + + for (int sink_x = start_x; sink_x <= end_x; sink_x++) { + for (int sink_y = start_y; sink_y <= end_y; sink_y++) { + int delta_x = abs(sink_x - source_x); + int delta_y = abs(sink_y - source_y); + if (!found_matrix[delta_x][delta_y]) { + add_delay_to_matrix(matrix, delta_x, delta_y, IMPOSSIBLE_DELTA); + VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n", + source_x, + source_y, + from_layer_num, + sink_x, + sink_y, + to_layer_num, + IMPOSSIBLE_DELTA); + } + } + } +} + +static float route_connection_delay(RouterDelayProfiler& route_profiler, + int source_x, + int source_y, + int source_layer, + int sink_x, + int sink_y, + int sink_layer, + const t_router_opts& router_opts, + bool measure_directconnect) { + //Routes between the source and sink locations and calculates the delay + + // set to known value for debug purposes + float net_delay_value = IMPOSSIBLE_DELTA; + + const auto& device_ctx = g_vpr_ctx.device(); + + bool successfully_routed = false; + + // Get the rr nodes to route between + auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, source_layer})); + auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, sink_layer})); + + for (int driver_ptc : best_driver_ptcs) { + VTR_ASSERT(driver_ptc != OPEN); + RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(source_layer, source_x, source_y, SOURCE, driver_ptc); + + VTR_ASSERT(source_rr_node != RRNodeId::INVALID()); + + for (int sink_ptc : best_sink_ptcs) { + VTR_ASSERT(sink_ptc != OPEN); + RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(sink_layer, sink_x, sink_y, SINK, sink_ptc); + + if (sink_rr_node == RRNodeId::INVALID()) + continue; + + if (!measure_directconnect && 
directconnect_exists(source_rr_node, sink_rr_node)) { + // Skip if we shouldn't measure direct connects and a direct connect exists + continue; + } + + successfully_routed = route_profiler.calculate_delay(source_rr_node, + sink_rr_node, + router_opts, + &net_delay_value); + + if (successfully_routed) break; + } + if (successfully_routed) break; + } + + if (!successfully_routed) { + VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n", + source_x, source_y, source_layer, sink_x, sink_y, sink_layer, net_delay_value); + } + + return net_delay_value; +} + +static float delay_reduce(std::vector& delays, e_reducer reducer) { + if (delays.empty()) { + return IMPOSSIBLE_DELTA; + } + + if (delays.size() == 1) { + return delays[0]; + } + + VTR_ASSERT(delays.size() > 1); + + float delay; + + if (reducer == e_reducer::MIN) { + auto itr = std::min_element(delays.begin(), delays.end()); + delay = *itr; + } else if (reducer == e_reducer::MAX) { + auto itr = std::max_element(delays.begin(), delays.end()); + delay = *itr; + } else if (reducer == e_reducer::MEDIAN) { + std::stable_sort(delays.begin(), delays.end()); + delay = vtr::median(delays.begin(), delays.end()); + } else if (reducer == e_reducer::ARITHMEAN) { + delay = vtr::arithmean(delays.begin(), delays.end()); + } else if (reducer == e_reducer::GEOMEAN) { + delay = vtr::geomean(delays.begin(), delays.end()); + } else { + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer"); + } + + return delay; +} + +static void add_delay_to_matrix(vtr::Matrix>& matrix, + int delta_x, + int delta_y, + float delay) { + if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) { + // Overwrite empty delta + matrix[delta_x][delta_y][0] = delay; + } else { + // Collect delta + matrix[delta_x][delta_y].push_back(delay); + } +} + +static float find_neighboring_average(vtr::NdMatrix& matrix, + int from_layer, + t_physical_tile_loc 
to_tile_loc, + int max_distance) { + float sum = 0.f; + int num_samples = 0; + const int endx = matrix.end_index(2); + const int endy = matrix.end_index(3); + + const int x = to_tile_loc.x; + const int y = to_tile_loc.y; + const int to_layer = to_tile_loc.layer_num; + + for (int distance = 1; distance <= max_distance; ++distance) { + for (int delx = x - distance; delx <= x + distance; delx++) { + for (int dely = y - distance; dely <= y + distance; dely++) { + // Check distance constraint + if (abs(delx - x) + abs(dely - y) > distance) { + continue; + } + + //check out of bounds + if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) { + continue; + } + + if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) { + continue; + } + + sum += matrix[from_layer][to_layer][delx][dely]; + num_samples++; + } + } + + if (num_samples != 0) { + return sum / (float)num_samples; + } + } + + return IMPOSSIBLE_DELTA; +} + +/***************************************************************************************/ + +vtr::NdMatrix compute_delta_delay_model(RouterDelayProfiler& route_profiler, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + bool measure_directconnect, + int longest_length, + bool is_flat) { + vtr::ScopedStartFinishTimer timer("Computing delta delays"); + vtr::NdMatrix delta_delays = compute_delta_delays(route_profiler, + placer_opts, + router_opts, + measure_directconnect, + longest_length, + is_flat); + + const size_t num_elements = delta_delays.size(); + + // set uninitialized elements to infinity + for (size_t i = 0; i < num_elements; i++) { + if (delta_delays.get(i) == UNINITIALIZED_DELTA) { + delta_delays.get(i) = IMPOSSIBLE_DELTA; + } + } + + fix_empty_coordinates(delta_delays); + + fill_impossible_coordinates(delta_delays); + + verify_delta_delays(delta_delays); + + return delta_delays; +} + +//Finds a src_rr and sink_rr appropriate for 
+// measuring the delay of the current direct specification
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node) {
+    VTR_ASSERT(from_type != nullptr);
+    VTR_ASSERT(to_type != nullptr);
+
+    auto& device_ctx = g_vpr_ctx.device();
+    auto& grid = device_ctx.grid;
+    const auto& node_lookup = device_ctx.rr_graph.node_lookup();
+
+    //Search the grid for an instance of from/to blocks which satisfy this direct connect offsets,
+    //and which has the appropriate pins
+    int from_x = -1;
+    int from_y = -1;
+    int from_sub_tile = -1;
+    int to_x = 0, to_y = 0, to_sub_tile = 0;
+    bool found = false;
+    int found_layer_num = -1;
+    //TODO: Function *FOR NOW* assumes that from/to blocks are on the same die and have the same layer number
+    for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) {
+        for (int x = 0; x < (int)grid.width() && !found; ++x) {
+            to_x = x + direct->x_offset;
+            if (to_x < 0 || to_x >= (int)grid.width()) continue;
+
+            for (int y = 0; y < (int)grid.height() && !found; ++y) {
+                if (grid.get_physical_type({x, y, layer_num}) != from_type) continue;
+
+                //Check that the from pin exists at this from location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool from_pin_found = false;
+                if (direct->from_side != NUM_2D_SIDES) {
+                    RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side);
+                    from_pin_found = from_pin_rr.is_valid();
+                } else {
+                    from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty());
+                }
+                if (!from_pin_found) continue;
+
+                to_y = y + direct->y_offset;
+
+                if (to_y < 0 || to_y >= (int)grid.height()) continue;
+                if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue;
+
+                //Check that the to pin exists at this to location
+                //(with multi-width/height blocks pins may not exist at all locations)
+                bool to_pin_found = false;
+                if (direct->to_side != NUM_2D_SIDES) {
+                    RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side);
+                    to_pin_found = to_pin_rr.is_valid();
+                } else {
+                    to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty());
+                }
+                if (!to_pin_found) continue;
+
+                for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) {
+                    to_sub_tile = sub_tile_num + direct->sub_tile_offset;
+
+                    if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue;
+
+                    found = true;
+                    found_layer_num = layer_num;
+                    from_x = x;
+                    from_y = y;
+                    from_sub_tile = sub_tile_num;
+
+                    break;
+                }
+            }
+        }
+    }
+
+    if (!found) {
+        return false;
+    }
+
+    //Now have a legal instance of this direct connect
+    VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type);
+    VTR_ASSERT(from_sub_tile < from_type->capacity);
+
+    VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type);
+    VTR_ASSERT(to_sub_tile < to_type->capacity);
+
+    VTR_ASSERT(from_x + direct->x_offset == to_x);
+    VTR_ASSERT(from_y + direct->y_offset == to_y);
+    VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile);
+
+    // Find a source/sink RR node associated with the pins of the direct
+    {
+        RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class);
+        VTR_ASSERT(src_rr_candidate);
+        out_src_node = src_rr_candidate;
+    }
+
+    {
+        RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class);
+        VTR_ASSERT(sink_rr_candidate);
+        out_sink_node = sink_rr_candidate;
+    }
+
+    return true;
+}
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
+    std::vector<int> best_classes;
+
+    //Record any non-zero Fc pins
+    //
+    //Note that we track non-zero Fc pins, since certain Fc overrides
+    //may apply to only a subset of wire types. This ensures we record
+    //which pins can potentially connect to global routing.
+    std::unordered_set<int> non_zero_fc_pins;
+    for (const t_fc_specification& fc_spec : type->fc_specs) {
+        if (fc_spec.fc_value == 0) continue;
+
+        non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end());
+    }
+
+    // Collect all classes of matching type which connect to general routing
+    for (int i = 0; i < (int)type->class_inf.size(); i++) {
+        if (type->class_inf[i].type == pintype) {
+            //Check whether all pins in this class are ignored or have zero fc
+            bool any_pins_connect_to_general_routing = false;
+            for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
+                int pin = type->class_inf[i].pinlist[ipin];
+                //If the pin isn't ignored, and has a non-zero Fc to some general
+                //routing the class is suitable for delay profiling
+                if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) {
+                    any_pins_connect_to_general_routing = true;
+                    break;
+                }
+            }
+
+            // Skip if the pin class doesn't connect to general routing
+            if (!any_pins_connect_to_general_routing) continue;
+
+            // Record candidate class
+            best_classes.push_back(i);
+        }
+    }
+
+    // Sort classes so the largest pin class is first
+    auto cmp_class = [&](int lhs, int rhs) {
+        return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
+    };
+
+    std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);
+
+    return best_classes;
+}
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
new file mode 100644
index 00000000000..71ac632b149
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/compute_delta_delays_utils.h
@@ -0,0 +1,56 @@
+
+#pragma once
+
+#include "vtr_ndmatrix.h"
+#include "physical_types.h"
+#include "rr_graph_fwd.h"
+
+struct t_placer_opts;
+struct t_router_opts;
+class RouterDelayProfiler;
+
+vtr::NdMatrix<float, 4> compute_delta_delay_model(RouterDelayProfiler& route_profiler,
+                                                  const t_placer_opts& placer_opts,
+                                                  const t_router_opts& router_opts,
+                                                  bool measure_directconnect,
+                                                  int longest_length,
+                                                  bool is_flat);
+
+bool find_direct_connect_sample_locations(const t_direct_inf* direct,
+                                          t_physical_tile_type_ptr from_type,
+                                          int from_pin,
+                                          int from_pin_class,
+                                          t_physical_tile_type_ptr to_type,
+                                          int to_pin,
+                                          int to_pin_class,
+                                          RRNodeId& out_src_node,
+                                          RRNodeId& out_sink_node);
+
+/**
+ * @brief Identifies the best pin classes for delay calculation based on pin count and connectivity.
+ *
+ * This function selects pin classes of a specified type (`pintype`) from a physical tile type (`type`)
+ * that are suitable for delay calculations. It prioritizes pin classes with the largest number of pins
+ * that connect to general routing, ensuring commonly used pins are chosen for delay profiling.
+ *
+ * @param pintype The type of pins to filter.
+ * @param type Pointer to the physical tile type containing pin and class information.
+ *
+ * @return A vector of indices representing the selected pin classes. The classes are sorted
+ *         in descending order based on the number of pins they contain.
+ *
+ * @details
+ * - A pin class is eligible if its type matches `pintype` and it contains at least one pin
+ *   that connects to general routing (non-zero Fc).
+ * - Non-zero Fc pins are determined by inspecting the tile's `fc_specs`.
+ * - Classes are sorted so that the class with the largest number of pins appears first.
+ *   If multiple classes have the same pin count, their order depends on their initial appearance
+ *   in the architecture file.
+ *
+ * @note
+ * - Pins explicitly marked as ignored in `type->is_ignored_pin` are excluded.
+ * - The function ensures stability in sorting, preserving the input order for classes
+ *   with the same number of pins.
+ */
+
+std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type);
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.cpp b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
new file mode 100644
index 00000000000..f4e202e7106
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.cpp
@@ -0,0 +1,48 @@
+
+#include "delta_delay_model.h"
+
+#include "compute_delta_delays_utils.h"
+
+void DeltaDelayModel::compute(RouterDelayProfiler& route_profiler,
+                              const t_placer_opts& placer_opts,
+                              const t_router_opts& router_opts,
+                              int longest_length) {
+    delays_ = compute_delta_delay_model(route_profiler,
+                                        placer_opts,
+                                        router_opts,
+                                        /*measure_directconnect=*/true,
+                                        longest_length,
+                                        is_flat_);
+}
+
+float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/,
+                             const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
+    int delta_x = std::abs(from_loc.x - to_loc.x);
+    int delta_y = std::abs(from_loc.y - to_loc.y);
+
+    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
+}
+
+void DeltaDelayModel::dump_echo(std::string filepath) const {
+    FILE* f = vtr::fopen(filepath.c_str(), "w");
+    fprintf(f, " ");
+    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
+        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
+            fprintf(f, " %9zu", from_layer_num);
+            fprintf(f, "\n");
+            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
+                fprintf(f, " %9zu", dx);
+            }
+            fprintf(f, "\n");
+            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) {
+                fprintf(f, "%9zu", dy);
+                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
+                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
+                }
+                fprintf(f, "\n");
+            }
+        }
+    }
+    vtr::fclose(f);
+}
+
diff --git a/vpr/src/place/timing/delay_model/delta_delay_model.h b/vpr/src/place/timing/delay_model/delta_delay_model.h
new file mode 100644
index 00000000000..c3ae0d83cf7
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/delta_delay_model.h
@@ -0,0 +1,47 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+
+/**
+ * @class DeltaDelayModel
+ *
+ * @brief A simple delay model based on the distance (delta) between block locations.
+ */
+class DeltaDelayModel : public PlaceDelayModel {
+  public:
+    DeltaDelayModel(float min_cross_layer_delay,
+                    bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    DeltaDelayModel(float min_cross_layer_delay,
+                    vtr::NdMatrix<float, 4> delta_delays,
+                    bool is_flat)
+        : delays_(std::move(delta_delays))
+        , cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& router,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+    const vtr::NdMatrix<float, 4>& delays() const {
+        return delays_;
+    }
+
+  private:
+    vtr::NdMatrix<float, 4> delays_; // [0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy] (from layer, to layer, dx, dy)
+    float cross_layer_delay_;
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+};
\ No newline at end of file
diff --git a/vpr/src/place/place_delay_model.cpp b/vpr/src/place/timing/delay_model/override_delay_model.cpp
similarity index 58%
rename from vpr/src/place/place_delay_model.cpp
rename to vpr/src/place/timing/delay_model/override_delay_model.cpp
index 4f626a5817f..d496a43b5e7 100644
--- a/vpr/src/place/place_delay_model.cpp
+++ b/vpr/src/place/timing/delay_model/override_delay_model.cpp
@@ -1,21 +1,7 @@
-/**
- * @file place_delay_model.cpp
- * @brief This file implements all the class methods and individual
- * routines related to the placer delay model.
- */
 
-#include <queue>
-#include "place_delay_model.h"
-#include "globals.h"
-#include "router_lookahead_map.h"
-#include "rr_graph2.h"
+#include "override_delay_model.h"
 
-#include "timing_place_lookup.h"
-#include "placer_state.h"
-
-#include "vtr_log.h"
-#include "vtr_math.h"
-#include "vpr_error.h"
+#include "compute_delta_delays_utils.h"
 
 #ifdef VTR_ENABLE_CAPNPROTO
 #    include "capnp/serialize.h"
@@ -23,48 +9,109 @@
 #    include "ndmatrix_serdes.h"
 #    include "mmap_file.h"
 #    include "serdes_utils.h"
-#endif /* VTR_ENABLE_CAPNPROTO */
-
-///@brief DeltaDelayModel methods.
-float DeltaDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    return delays_[from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
+#endif // VTR_ENABLE_CAPNPROTO
+
+void OverrideDelayModel::compute(RouterDelayProfiler& route_profiler,
+                                 const t_placer_opts& placer_opts,
+                                 const t_router_opts& router_opts,
+                                 int longest_length) {
+    auto delays = compute_delta_delay_model(route_profiler,
+                                            placer_opts,
+                                            router_opts,
+                                            /*measure_directconnect=*/false,
+                                            longest_length,
+                                            is_flat_);
+
+    base_delay_model_ = std::make_unique<DeltaDelayModel>(cross_layer_delay_, std::move(delays), false);
+
+    compute_override_delay_model_(route_profiler, router_opts);
 }
 
-void DeltaDelayModel::dump_echo(std::string filepath) const {
-    FILE* f = vtr::fopen(filepath.c_str(), "w");
-    fprintf(f, " ");
-    for (size_t from_layer_num = 0; from_layer_num < delays_.dim_size(0); ++from_layer_num) {
-        for (size_t to_layer_num = 0; to_layer_num < delays_.dim_size(1); ++to_layer_num) {
-            fprintf(f, " %9zu", from_layer_num);
-            fprintf(f, "\n");
-            for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                fprintf(f, " %9zu", dx);
+void OverrideDelayModel::compute_override_delay_model_(RouterDelayProfiler& route_profiler,
+                                                       const t_router_opts& router_opts) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    t_router_opts router_opts2 = router_opts;
+    router_opts2.astar_fac = 0.f;
+    router_opts2.astar_offset = 0.f;
+
+    // Look at all the direct connections that exist, and add overrides to delay model
+    for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) {
+        const t_direct_inf* direct = &device_ctx.arch->directs[idirect];
+
+        InstPort from_port = parse_inst_port(direct->from_pin);
+        InstPort to_port = parse_inst_port(direct->to_pin);
+
+        t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
+        t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);
+
+        int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
+        VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");
+
+        //We now walk through all the connections associated with the current direct specification, measure
+        //their delay and specify that value as an override in the delay model.
+        //
+        //Note that we need to check every connection in the direct to cover the case where the pins are not
+        //equivalent.
+        //
+        //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
+        //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
+        //sampled_rr_pairs and skipping them if they occur multiple times.
+        int missing_instances = 0;
+        int missing_paths = 0;
+        std::set<std::pair<RRNodeId, RRNodeId>> sampled_rr_pairs;
+        for (int iconn = 0; iconn < num_conns; ++iconn) {
+            //Find the associated pins
+            int from_pin = from_type->find_pin(from_port.port_name(), from_port.port_low_index() + iconn);
+            int to_pin = to_type->find_pin(to_port.port_name(), to_port.port_low_index() + iconn);
+
+            VTR_ASSERT(from_pin != OPEN);
+            VTR_ASSERT(to_pin != OPEN);
+
+            int from_pin_class = from_type->find_pin_class(from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
+            VTR_ASSERT(from_pin_class != OPEN);
+
+            int to_pin_class = to_type->find_pin_class(to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
+            VTR_ASSERT(to_pin_class != OPEN);
+
+            bool found_sample_points;
+            RRNodeId src_rr, sink_rr;
+            found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr);
+
+            if (!found_sample_points) {
+                ++missing_instances;
+                continue;
             }
-            fprintf(f, "\n");
-            for (size_t dy = 0; dy < delays_.dim_size(3); ++dy) {
-                fprintf(f, "%9zu", dy);
-                for (size_t dx = 0; dx < delays_.dim_size(2); ++dx) {
-                    fprintf(f, " %9.2e", delays_[from_layer_num][to_layer_num][dx][dy]);
-                }
-                fprintf(f, "\n");
+
+            //If some of the source/sink ports are logically equivalent we may have already
+            //sampled the associated source/sink pair and don't need to do so again
+            if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;
+
+            float direct_connect_delay = std::numeric_limits<float>::quiet_NaN();
+            bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);
+
+            if (found_routing_path) {
+                set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
+            } else {
+                ++missing_paths;
             }
+
+            //Record that we've sampled this pair of source and sink nodes
+            sampled_rr_pairs.insert({src_rr, sink_rr});
         }
+
+        VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str());
+        VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str());
     }
-    vtr::fclose(f);
 }
 
 const DeltaDelayModel* OverrideDelayModel::base_delay_model() const {
     return base_delay_model_.get();
 }
 
-///@brief OverrideDelayModel methods.
 float OverrideDelayModel::delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const {
-    //First check to if there is an override delay value
-    auto& device_ctx = g_vpr_ctx.device();
-    auto& grid = device_ctx.grid;
+    // First check if there is an override delay value
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& grid = device_ctx.grid;
 
     t_physical_tile_type_ptr from_type_ptr = grid.get_physical_type(from_loc);
     t_physical_tile_type_ptr to_type_ptr = grid.get_physical_type(to_loc);
@@ -152,14 +199,6 @@ void OverrideDelayModel::set_base_delay_model(std::unique_ptr<DeltaDelayModel> b
     base_delay_model_ = std::move(base_delay_model_obj);
 }
 
-float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const {
-    int delta_x = std::abs(from_loc.x - to_loc.x);
-    int delta_y = std::abs(from_loc.y - to_loc.y);
-
-    int from_tile_idx = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index;
-    return delays_[from_tile_idx][from_loc.layer_num][to_loc.layer_num][delta_x][delta_y];
-}
-
 /**
  * When writing capnp targetted serialization, always allow compilation when
  * VTR_ENABLE_CAPNPROTO=OFF. Generally this means throwing an exception instead.
@@ -315,89 +354,4 @@ void OverrideDelayModel::write(const std::string& file) const {
     writeMessageToFile(file, &builder);
 }
 
-#endif
-
-///@brief Initialize the placer delay model.
-std::unique_ptr<PlaceDelayModel> alloc_lookups_and_delay_model(const Netlist<>& net_list,
-                                                               t_chan_width_dist chan_width_dist,
-                                                               const t_placer_opts& placer_opts,
-                                                               const t_router_opts& router_opts,
-                                                               t_det_routing_arch* det_routing_arch,
-                                                               std::vector<t_segment_inf>& segment_inf,
-                                                               const std::vector<t_direct_inf>& directs,
-                                                               bool is_flat) {
-    return compute_place_delay_model(placer_opts,
-                                     router_opts,
-                                     net_list,
-                                     det_routing_arch,
-                                     segment_inf,
-                                     chan_width_dist,
-                                     directs,
-                                     is_flat);
-}
-
-/**
- * @brief Returns the delay of one point to point connection.
- *
- * Only estimate delay for signals routed through the inter-block routing network.
- * TODO: Do how should we compute the delay for globals. "Global signals are assumed to have zero delay."
- */
-float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
-                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
-                                      ClusterNetId net_id,
-                                      int ipin) {
-    auto& cluster_ctx = g_vpr_ctx.clustering();
-
-    float delay_source_to_sink = 0.;
-
-    if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
-        ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
-        ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
-
-        ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
-        ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
-
-        int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
-        int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
-
-        t_pl_loc source_block_loc = block_locs[source_block].loc;
-        t_pl_loc sink_block_loc = block_locs[sink_block].loc;
-
-        /**
-         * This heuristic only considers delta_x and delta_y, a much better
-         * heuristic would be to to create a more comprehensive lookup table.
-         *
-         * In particular this approach does not accurately capture the effect
-         * of fast carry-chain connections.
-         */
-        delay_source_to_sink = delay_model->delay({source_block_loc.x, source_block_loc.y, source_block_loc.layer}, source_block_ipin,
-                                                  {sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer}, sink_block_ipin);
-        if (delay_source_to_sink < 0) {
-            VPR_ERROR(VPR_ERROR_PLACE,
-                      "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d,%d) to %s (at %d,%d,%d)\n"
-                      "in comp_td_single_connection_delay: Delay is less than 0\n",
-                      block_type_pin_index_to_name(physical_tile_type(source_block_loc), source_block_ipin, false).c_str(),
-                      source_block_loc.x, source_block_loc.y, source_block_loc.layer,
-                      block_type_pin_index_to_name(physical_tile_type(sink_block_loc), sink_block_ipin, false).c_str(),
-                      sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer,
-                      delay_source_to_sink);
-        }
-    }
-
-    return (delay_source_to_sink);
-}
-
-///@brief Recompute all point to point delays, updating `connection_delay` matrix.
-void comp_td_connection_delays(const PlaceDelayModel* delay_model,
-                               PlacerState& placer_state) {
-    const auto& cluster_ctx = g_vpr_ctx.clustering();
-    auto& p_timing_ctx = placer_state.mutable_timing();
-    auto& block_locs = placer_state.block_locs();
-    auto& connection_delay = p_timing_ctx.connection_delay;
-
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
-        for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) {
-            connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, block_locs, net_id, ipin);
-        }
-    }
-}
+#endif
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/override_delay_model.h b/vpr/src/place/timing/delay_model/override_delay_model.h
new file mode 100644
index 00000000000..5965261c272
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/override_delay_model.h
@@ -0,0 +1,112 @@
+
+#pragma once
+
+#include "place_delay_model.h"
+#include "delta_delay_model.h"
+
+class OverrideDelayModel : public PlaceDelayModel {
+  public:
+    OverrideDelayModel(float min_cross_layer_delay,
+                       bool is_flat)
+        : cross_layer_delay_(min_cross_layer_delay)
+        , is_flat_(is_flat) {}
+
+    void compute(RouterDelayProfiler& route_profiler,
+                 const t_placer_opts& placer_opts,
+                 const t_router_opts& router_opts,
+                 int longest_length) override;
+
+    /**
+     * @brief returns delay from the specified (x,y) to the specified (x,y) with both endpoints on layer_num and the
+     * specified from and to pins
+     */
+    float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const override;
+
+    void dump_echo(std::string filepath) const override;
+
+    void read(const std::string& file) override;
+    void write(const std::string& file) const override;
+
+  public: //Mutators
+    void set_base_delay_model(std::unique_ptr<DeltaDelayModel> base_delay_model);
+    const DeltaDelayModel* base_delay_model() const;
+    float get_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y) const;
+    void set_delay_override(int from_type, int from_class, int to_type, int to_class, int delta_x, int delta_y, float delay);
+
+  private:
+    std::unique_ptr<DeltaDelayModel> base_delay_model_;
+    /// Minimum delay of cross-layer connections
+    float cross_layer_delay_;
+
+    /// Indicates whether the router is a two-stage or run-flat
+    bool is_flat_;
+
+    void compute_override_delay_model_(RouterDelayProfiler& router,
+                                       const t_router_opts& router_opts);
+
+    /**
+     * @brief Structure that allows delays to be queried from the delay model.
+     *
+     * Delay is calculated given the origin physical tile, the origin
+     * pin, the destination physical tile, and the destination pin.
+     * This structure encapsulates all of this information.
+     *
+     * @param from_type, to_type
+     *              Physical tile index (for easy array access)
+     * @param from_class, to_class
+     *              The class that the pins belong to.
+     * @param delta_x, delta_y
+     *              The horizontal and vertical displacement
+     *              between two physical tiles.
+     */
+    struct t_override {
+        short from_type;
+        short to_type;
+        short from_class;
+        short to_class;
+        short delta_x;
+        short delta_y;
+
+        /**
+         * @brief Comparison operator designed for performance.
+         *
+         * Operator< is important since t_override serves as the key into the
+         * map structure delay_overrides_. A default comparison operator would
+         * not be inlined by the compiler.
+         *
+         * A combination of ALWAYS_INLINE attribute and std::lexicographical_compare
+         * is required for operator< to be inlined by compiler. Proper inlining of
+         * the function reduces place time by around 5%.
+         *
+         * For more information: https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1225
+         */
+        friend ALWAYS_INLINE bool operator<(const t_override& lhs, const t_override& rhs) {
+            const short* left = reinterpret_cast<const short*>(&lhs);
+            const short* right = reinterpret_cast<const short*>(&rhs);
+            constexpr size_t NUM_T_OVERRIDE_MEMBERS = sizeof(t_override) / sizeof(short);
+            return std::lexicographical_compare(left, left + NUM_T_OVERRIDE_MEMBERS, right, right + NUM_T_OVERRIDE_MEMBERS);
+        }
+    };
+
+    /**
+     * @brief Map data structure that returns delay values according to
+     * specific delay model queries.
+     *
+     * Delay model queries are provided by the t_override structure, which
+     * encapsulates the information regarding the origin and the destination.
+     */
+    vtr::flat_map2<t_override, float> delay_overrides_;
+
+    /**
+     * operator< treats memory layout of t_override as an array of short.
+     * This requires all members of t_override are shorts and there is no
+     * padding between members of t_override.
+     */
+    static_assert(sizeof(t_override) == sizeof(t_override::from_type) + sizeof(t_override::to_type) + sizeof(t_override::from_class) + sizeof(t_override::to_class) + sizeof(t_override::delta_x) + sizeof(t_override::delta_y), "Expect t_override to have a memory layout equivalent to an array of short (no padding)");
+    static_assert(sizeof(t_override::from_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_type) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::from_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::to_class) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_x) == sizeof(short), "Expect all t_override data members to be shorts");
+    static_assert(sizeof(t_override::delta_y) == sizeof(short), "Expect all t_override data members to be shorts");
+};
\ No newline at end of file
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.cpp b/vpr/src/place/timing/delay_model/place_delay_model.cpp
new file mode 100644
index 00000000000..04267e0e5f1
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/place_delay_model.cpp
@@ -0,0 +1,78 @@
+/**
+ * @file place_delay_model.cpp
+ * @brief This file implements all the class methods and individual
+ *        routines related to the placer delay model.
+ */
+
+#include "place_delay_model.h"
+
+#include "globals.h"
+#include "router_lookahead_map.h"
+#include "placer_state.h"
+#include "vpr_error.h"
+
+/**
+ * @brief Returns the delay of one point to point connection.
+ *
+ * Only estimate delay for signals routed through the inter-block routing network.
+ * TODO: How should we compute the delay for globals? "Global signals are assumed to have zero delay."
+ */
+float comp_td_single_connection_delay(const PlaceDelayModel* delay_model,
+                                      const vtr::vector_map<ClusterBlockId, t_block_loc>& block_locs,
+                                      ClusterNetId net_id,
+                                      int ipin) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+
+    float delay_source_to_sink = 0.;
+
+    if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+        ClusterPinId source_pin = cluster_ctx.clb_nlist.net_driver(net_id);
+        ClusterPinId sink_pin = cluster_ctx.clb_nlist.net_pin(net_id, ipin);
+
+        ClusterBlockId source_block = cluster_ctx.clb_nlist.pin_block(source_pin);
+        ClusterBlockId sink_block = cluster_ctx.clb_nlist.pin_block(sink_pin);
+
+        int source_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(source_pin);
+        int sink_block_ipin = cluster_ctx.clb_nlist.pin_logical_index(sink_pin);
+
+        t_pl_loc source_block_loc = block_locs[source_block].loc;
+        t_pl_loc sink_block_loc = block_locs[sink_block].loc;
+
+        /**
+         * This heuristic only considers delta_x and delta_y, a much better
+         * heuristic would be to create a more comprehensive lookup table.
+         *
+         * In particular this approach does not accurately capture the effect
+         * of fast carry-chain connections.
+         */
+        delay_source_to_sink = delay_model->delay({source_block_loc.x, source_block_loc.y, source_block_loc.layer}, source_block_ipin,
+                                                  {sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer}, sink_block_ipin);
+        if (delay_source_to_sink < 0) {
+            VPR_ERROR(VPR_ERROR_PLACE,
+                      "in comp_td_single_connection_delay: Bad delay_source_to_sink value %g from %s (at %d,%d,%d) to %s (at %d,%d,%d)\n"
+                      "in comp_td_single_connection_delay: Delay is less than 0\n",
+                      block_type_pin_index_to_name(physical_tile_type(source_block_loc), source_block_ipin, false).c_str(),
+                      source_block_loc.x, source_block_loc.y, source_block_loc.layer,
+                      block_type_pin_index_to_name(physical_tile_type(sink_block_loc), sink_block_ipin, false).c_str(),
+                      sink_block_loc.x, sink_block_loc.y, sink_block_loc.layer,
+                      delay_source_to_sink);
+        }
+    }
+
+    return (delay_source_to_sink);
+}
+
+///@brief Recompute all point to point delays, updating `connection_delay` matrix.
+void comp_td_connection_delays(const PlaceDelayModel* delay_model,
+                               PlacerState& placer_state) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    auto& p_timing_ctx = placer_state.mutable_timing();
+    auto& block_locs = placer_state.block_locs();
+    auto& connection_delay = p_timing_ctx.connection_delay;
+
+    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+        for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net_id).size(); ++ipin) {
+            connection_delay[net_id][ipin] = comp_td_single_connection_delay(delay_model, block_locs, net_id, ipin);
+        }
+    }
+}
diff --git a/vpr/src/place/timing/delay_model/place_delay_model.h b/vpr/src/place/timing/delay_model/place_delay_model.h
new file mode 100644
index 00000000000..27c89591071
--- /dev/null
+++ b/vpr/src/place/timing/delay_model/place_delay_model.h
@@ -0,0 +1,80 @@
+/**
+ * @file place_delay_model.h
+ * @brief This file contains all the class and function declarations related to
+ *        the placer delay model. For implementations, see place_delay_model.cpp.
+ */ + +#pragma once + +#include "vtr_ndmatrix.h" +#include "vtr_flat_map.h" +#include "vpr_types.h" +#include "router_delay_profiling.h" + +#ifndef __has_attribute +# define __has_attribute(x) 0 // Compatibility with non-clang compilers. +#endif + +#if defined(COMPILER_GCC) && defined(NDEBUG) +# define ALWAYS_INLINE inline __attribute__((__always_inline__)) +#elif defined(COMPILER_MSVC) && defined(NDEBUG) +# define ALWAYS_INLINE __forceinline +#elif __has_attribute(always_inline) +# define ALWAYS_INLINE __attribute__((always_inline)) // clang +#else +# define ALWAYS_INLINE inline +#endif + +///@brief Forward declarations. +class PlaceDelayModel; +class PlacerState; + +///@brief Returns the delay of one point to point connection. +float comp_td_single_connection_delay(const PlaceDelayModel* delay_model, + const vtr::vector_map& block_locs, + ClusterNetId net_id, + int ipin); + +///@brief Recompute all point to point delays, updating `connection_delay` matrix. +void comp_td_connection_delays(const PlaceDelayModel* delay_model, + PlacerState& placer_state); + +///@brief Abstract interface to a placement delay model. +class PlaceDelayModel { + public: + virtual ~PlaceDelayModel() = default; + + ///@brief Computes place delay model. + virtual void compute(RouterDelayProfiler& route_profiler, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + int longest_length) + = 0; + + /** + * @brief Returns the delay estimate between the specified block pins. + * + * Either compute or read methods must be invoked before invoking delay. + */ + virtual float delay(const t_physical_tile_loc& from_loc, int from_pin, const t_physical_tile_loc& to_loc, int to_pin) const = 0; + + ///@brief Dumps the delay model to an echo file. + virtual void dump_echo(std::string filename) const = 0; + + /** + * @brief Write place delay model to specified file. + * + * May be unimplemented, in which case method should throw an exception. 
+ */ + virtual void write(const std::string& file) const = 0; + + /** + * @brief Read place delay model from specified file. + * + * May be unimplemented, in which case method should throw an exception. + */ + virtual void read(const std::string& file) = 0; +}; + + + diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.cpp b/vpr/src/place/timing/delay_model/simple_delay_model.cpp new file mode 100644 index 00000000000..0031d9eb1fe --- /dev/null +++ b/vpr/src/place/timing/delay_model/simple_delay_model.cpp @@ -0,0 +1,45 @@ + +#include "simple_delay_model.h" + + +void SimpleDelayModel::compute(RouterDelayProfiler& route_profiler, + const t_placer_opts& /*placer_opts*/, + const t_router_opts& /*router_opts*/, + int /*longest_length*/) { + const auto& grid = g_vpr_ctx.device().grid; + const size_t num_physical_tile_types = g_vpr_ctx.device().physical_tile_types.size(); + const size_t num_layers = grid.get_num_layers(); + + // Initializing the delay matrix to [num_physical_types][num_layers][num_layers][width][height] + // The second index related to the layer that the source location is on and the third index is for the sink layer + delays_ = vtr::NdMatrix({num_physical_tile_types, + num_layers, + num_layers, + grid.width(), + grid.height()}); + + for (size_t physical_tile_type_idx = 0; physical_tile_type_idx < num_physical_tile_types; ++physical_tile_type_idx) { + for (size_t from_layer = 0; from_layer < num_layers; ++from_layer) { + for (size_t to_layer = 0; to_layer < num_layers; ++to_layer) { + for (size_t dx = 0; dx < grid.width(); ++dx) { + for (size_t dy = 0; dy < grid.height(); ++dy) { + float min_delay = route_profiler.get_min_delay(physical_tile_type_idx, + from_layer, + to_layer, + dx, + dy); + delays_[physical_tile_type_idx][from_layer][to_layer][dx][dy] = min_delay; + } + } + } + } + } +} + +float SimpleDelayModel::delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const { + int 
delta_x = std::abs(from_loc.x - to_loc.x); + int delta_y = std::abs(from_loc.y - to_loc.y); + + int from_tile_idx = g_vpr_ctx.device().grid.get_physical_type(from_loc)->index; + return delays_[from_tile_idx][from_loc.layer_num][to_loc.layer_num][delta_x][delta_y]; +} \ No newline at end of file diff --git a/vpr/src/place/timing/delay_model/simple_delay_model.h b/vpr/src/place/timing/delay_model/simple_delay_model.h new file mode 100644 index 00000000000..f5a856688cd --- /dev/null +++ b/vpr/src/place/timing/delay_model/simple_delay_model.h @@ -0,0 +1,39 @@ + +#pragma once + +#include "place_delay_model.h" + +/** + * @class SimpleDelayModel + * @brief A simple delay model based on the information stored in router lookahead + * This is in contrast to other placement delay models that get the cost of getting from one location to another by running the router + */ +class SimpleDelayModel : public PlaceDelayModel { + public: + SimpleDelayModel() {} + + /// @brief Use the information in the router lookahead to fill the delay matrix instead of running the router + void compute(RouterDelayProfiler& router, + const t_placer_opts& placer_opts, + const t_router_opts& router_opts, + int longest_length) override; + + float delay(const t_physical_tile_loc& from_loc, int /*from_pin*/, const t_physical_tile_loc& to_loc, int /*to_pin*/) const override; + + void dump_echo(std::string /*filepath*/) const override {} + + void read(const std::string& /*file*/) override {} + void write(const std::string& /*file*/) const override {} + + private: + /** + * @brief The matrix to store the minimum delay between different points on different layers. + * + *The matrix used to store delay information is a 5D matrix. This data structure stores the minimum delay for each tile type on each layer to other layers + *for each dx and dy. We decided to separate the delay for each physical type on each die to accommodate cases where the connectivity of a physical type differs + *on each layer. 
Additionally, instead of using d_layer, we distinguish between the destination layer to handle scenarios where connectivity between layers + *is not uniform. For example, if the number of inter-layer connections between layer 1 and 2 differs from the number of connections between layer 0 and 1. + *One might argue that this variability could also occur for dx and dy. However, we are operating under the assumption that the FPGA fabric architecture is regular. + */ + vtr::NdMatrix delays_; // [0..num_physical_type-1][0..num_layers-1][0..num_layers-1][0..max_dx][0..max_dy] +}; \ No newline at end of file diff --git a/vpr/src/place/place_timing_update.cpp b/vpr/src/place/timing/place_timing_update.cpp similarity index 98% rename from vpr/src/place/place_timing_update.cpp rename to vpr/src/place/timing/place_timing_update.cpp index c9c53b88f90..246db01f97d 100644 --- a/vpr/src/place/place_timing_update.cpp +++ b/vpr/src/place/timing/place_timing_update.cpp @@ -3,10 +3,15 @@ * @brief Defines the routines declared in place_timing_update.h. */ -#include "vtr_time.h" - #include "place_timing_update.h" + +#include "NetPinTimingInvalidator.h" +#include "PlacerCriticalities.h" +#include "PlacerSetupSlacks.h" #include "placer_state.h" +#include "place_util.h" +#include "vtr_time.h" + /* Routines local to place_timing_update.cpp */ static double comp_td_connection_cost(const PlaceDelayModel* delay_model, @@ -94,8 +99,7 @@ void perform_full_timing_update(const PlaceCritParams& crit_params, timing_info, criticalities, setup_slacks, - pin_timing_invalidator, - placer_state); + pin_timing_invalidator); /* Update the timing cost with new connection criticalities. 
*/ update_timing_cost(delay_model, @@ -136,13 +140,12 @@ void update_timing_classes(const PlaceCritParams& crit_params, SetupTimingInfo* timing_info, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, - NetPinTimingInvalidator* pin_timing_invalidator, - PlacerState& placer_state) { + NetPinTimingInvalidator* pin_timing_invalidator) { /* Run STA to update slacks and adjusted/relaxed criticalities. */ timing_info->update(); /* Update the placer's criticalities (e.g. sharpen with crit_exponent). */ - criticalities->update_criticalities(crit_params, placer_state); + criticalities->update_criticalities(crit_params); /* Update the placer's raw setup slacks. */ setup_slacks->update_setup_slacks(); diff --git a/vpr/src/place/place_timing_update.h b/vpr/src/place/timing/place_timing_update.h similarity index 93% rename from vpr/src/place/place_timing_update.h rename to vpr/src/place/timing/place_timing_update.h index 7944c4a7552..8e7a0dc1f46 100644 --- a/vpr/src/place/place_timing_update.h +++ b/vpr/src/place/timing/place_timing_update.h @@ -4,10 +4,15 @@ */ #pragma once -#include "timing_place.h" -#include "place_util.h" -#include "NetPinTimingInvalidator.h" +class PlacerState; +class PlaceCritParams; +class PlacerCriticalities; +class PlacerSetupSlacks; +class NetPinTimingInvalidator; +class PlaceDelayModel; +class SetupTimingInfo; +struct t_placer_costs; ///@brief Initialize the timing information and structures in the placer. void initialize_timing_info(const PlaceCritParams& crit_params, @@ -34,8 +39,7 @@ void update_timing_classes(const PlaceCritParams& crit_params, SetupTimingInfo* timing_info, PlacerCriticalities* criticalities, PlacerSetupSlacks* setup_slacks, - NetPinTimingInvalidator* pin_timing_invalidator, - PlacerState& placer_state); + NetPinTimingInvalidator* pin_timing_invalidator); ///@brief Updates the timing driven (td) costs. 
void update_timing_cost(const PlaceDelayModel* delay_model, diff --git a/vpr/src/place/timing_place.cpp b/vpr/src/place/timing_place.cpp deleted file mode 100644 index badd9d1fb61..00000000000 --- a/vpr/src/place/timing_place.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/** - * @file timing_place.cpp - * @brief Stores the method definitions of classes defined in timing_place.h. - */ - -#include - -#include "vtr_util.h" - -#include "vpr_types.h" -#include "vpr_utils.h" -#include "net_delay.h" -#include "timing_place.h" -#include "placer_state.h" - -#include "timing_info.h" - -///@brief Allocates space for the timing_place_crit_ data structure. -PlacerCriticalities::PlacerCriticalities(const ClusteredNetlist& clb_nlist, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info) - : clb_nlist_(clb_nlist) - , pin_lookup_(netlist_pin_lookup) - , timing_info_(std::move(timing_info)) - , timing_place_crit_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { -} - -/** - * @brief Updated the criticalities in the timing_place_crit_ data structure. - * - * If the criticalities are not updated immediately after each time we call - * timing_info->update(), then timing_info->pins_with_modified_setup_criticality() - * cannot accurately account for all the pins that need to be updated. In this case, - * `recompute_required` would be true, and we update all criticalities from scratch. - * - * If the criticality exponent has changed, we also need to update from scratch. - */ -void PlacerCriticalities::update_criticalities(const PlaceCritParams& crit_params, - PlacerState& placer_state) { - /* If update is not enabled, exit the routine. 
*/ - if (!update_enabled) { - /* re-computation is required on the next iteration */ - recompute_required = true; - return; - } - - /* Determine what pins need updating */ - if (!recompute_required && crit_params.crit_exponent == last_crit_exponent_) { - incr_update_criticalities(); - } else { - recompute_criticalities(); - - /* Record new criticality exponent */ - last_crit_exponent_ = crit_params.crit_exponent; - } - - auto& place_move_ctx = placer_state.mutable_move(); - - /* Performs a 1-to-1 mapping from criticality to timing_place_crit_. - * For every pin on every net (or, equivalently, for every tedge ending - * in that pin), timing_place_crit_ = criticality^(criticality exponent) */ - - /* Update the affected pins */ - for (ClusterPinId clb_pin : cluster_pins_with_modified_criticality_) { - ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); - int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); - // Routing for placement is not flat (at least for the time being) - float clb_pin_crit = calculate_clb_net_pin_criticality(*timing_info_, pin_lookup_, ParentPinId(size_t(clb_pin)), /*is_flat=*/false); - - float new_crit = pow(clb_pin_crit, crit_params.crit_exponent); - /* - * Update the highly critical pins container - * - * If the old criticality < limit and the new criticality > limit --> add this pin to the highly critical pins - * If the old criticality > limit and the new criticality < limit --> remove this pin from the highly critical pins - */ - if (!first_time_update_criticality) { - if (new_crit > crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] < crit_params.crit_limit) { - place_move_ctx.highly_crit_pins.emplace_back(clb_net, pin_index_in_net); - } else if (new_crit < crit_params.crit_limit && timing_place_crit_[clb_net][pin_index_in_net] > crit_params.crit_limit) { - place_move_ctx.highly_crit_pins.erase(std::remove(place_move_ctx.highly_crit_pins.begin(), place_move_ctx.highly_crit_pins.end(), std::make_pair(clb_net, 
pin_index_in_net)), - place_move_ctx.highly_crit_pins.end()); - } - } else { - if (new_crit > crit_params.crit_limit) { - place_move_ctx.highly_crit_pins.emplace_back(clb_net, pin_index_in_net); - } - } - - /* The placer likes a great deal of contrast between criticalities. - * Since path criticality varies much more than timing, we "sharpen" timing - * criticality by taking it to some power, crit_exponent (between 1 and 8 by default). */ - timing_place_crit_[clb_net][pin_index_in_net] = new_crit; - } - - /* Criticalities updated. In sync with timing info. */ - /* Can be incrementally updated on the next iteration */ - recompute_required = false; - - first_time_update_criticality = false; -} - -void PlacerCriticalities::set_recompute_required() { - recompute_required = true; -} - -/** - * @brief Collect the cluster pins which need to be updated based on the latest timing - * analysis so that incremental updates to criticalities can be performed. - * - * Note we use the set of pins reported by the *timing_info* as having modified - * criticality, rather than those marked as modified by the timing analyzer. - * - * Since timing_info uses shifted/relaxed criticality (which depends on max required - * time and worst case slacks), additional nodes may be modified when updating the - * atom pin criticalities. - */ - -void PlacerCriticalities::incr_update_criticalities() { - cluster_pins_with_modified_criticality_.clear(); - - for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_criticality()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. - if (!clb_pin) continue; - - cluster_pins_with_modified_criticality_.insert(clb_pin); - } -} - -/** - * @brief Collect all the sink pins in the netlist and prepare them update. 
- * - * For the incremental version, see PlacerCriticalities::incr_update_criticalities(). - */ -void PlacerCriticalities::recompute_criticalities() { - cluster_pins_with_modified_criticality_.clear(); - - /* Non-incremental: all sink pins need updating */ - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_criticality_.insert(pin_id); - } - } -} - -///@brief Override the criticality of a particular connection. -void PlacerCriticalities::set_criticality(ClusterNetId net_id, int ipin, float crit_val) { - VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); - VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); - - timing_place_crit_[net_id][ipin] = crit_val; -} - -/** - * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which - * were modified by the last call to PlacerCriticalities::update_criticalities(). - */ -PlacerCriticalities::pin_range PlacerCriticalities::pins_with_modified_criticality() const { - return vtr::make_range(cluster_pins_with_modified_criticality_); -} - -/**************************************/ - -///@brief Allocates space for the timing_place_setup_slacks_ data structure. -PlacerSetupSlacks::PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info) - : clb_nlist_(clb_nlist) - , pin_lookup_(netlist_pin_lookup) - , timing_info_(std::move(timing_info)) - , timing_place_setup_slacks_(make_net_pins_matrix(clb_nlist_, std::numeric_limits::quiet_NaN())) { -} - -/** - * @brief Updated the setup slacks in the timing_place_setup_slacks_ data structure. 
- * - * If the setup slacks are not updated immediately after each time we call - * timing_info->update(), then timing_info->pins_with_modified_setup_slack() - * cannot accurately account for all the pins that need to be updated. - * - * In this case, `recompute_required` would be true, and we update all setup slacks - * from scratch. - */ -void PlacerSetupSlacks::update_setup_slacks() { - /* If update is not enabled, exit the routine. */ - if (!update_enabled) { - /* re-computation is required on the next iteration */ - recompute_required = true; - return; - } - - /* Determine what pins need updating */ - if (!recompute_required) { - incr_update_setup_slacks(); - } else { - recompute_setup_slacks(); - } - - /* Update the affected pins */ - for (ClusterPinId clb_pin : cluster_pins_with_modified_setup_slack_) { - ClusterNetId clb_net = clb_nlist_.pin_net(clb_pin); - int pin_index_in_net = clb_nlist_.pin_net_index(clb_pin); - - float clb_pin_setup_slack = calculate_clb_net_pin_setup_slack(*timing_info_, pin_lookup_, clb_pin); - - timing_place_setup_slacks_[clb_net][pin_index_in_net] = clb_pin_setup_slack; - } - - /* Setup slacks updated. In sync with timing info. */ - /* Can be incrementally updated on the next iteration. */ - recompute_required = false; -} - -/** - * @brief Collect the cluster pins which need to be updated based on the latest timing - * analysis so that incremental updates to setup slacks can be performed. - * - * Note we use the set of pins reported by the *timing_info* as having modified - * setup slacks, rather than those marked as modified by the timing analyzer. 
- */ -void PlacerSetupSlacks::incr_update_setup_slacks() { - cluster_pins_with_modified_setup_slack_.clear(); - - for (AtomPinId atom_pin : timing_info_->pins_with_modified_setup_slack()) { - ClusterPinId clb_pin = pin_lookup_.connected_clb_pin(atom_pin); - - //Some atom pins correspond to connections which are completely - //contained within a cluster, and hence have no corresponding - //clustered pin. - if (!clb_pin) continue; - - cluster_pins_with_modified_setup_slack_.insert(clb_pin); - } -} - -/** - * @brief Collect all the sink pins in the netlist and prepare them update. - * - * For the incremental version, see PlacerSetupSlacks::incr_update_setup_slacks(). - */ -void PlacerSetupSlacks::recompute_setup_slacks() { - cluster_pins_with_modified_setup_slack_.clear(); - - /* Non-incremental: all sink pins need updating */ - for (ClusterNetId net_id : clb_nlist_.nets()) { - for (ClusterPinId pin_id : clb_nlist_.net_sinks(net_id)) { - cluster_pins_with_modified_setup_slack_.insert(pin_id); - } - } -} - -///@brief Override the setup slack of a particular connection. -void PlacerSetupSlacks::set_setup_slack(ClusterNetId net_id, int ipin, float slack_val) { - VTR_ASSERT_SAFE_MSG(ipin > 0, "The pin should not be a driver pin (ipin != 0)"); - VTR_ASSERT_SAFE_MSG(ipin < int(clb_nlist_.net_pins(net_id).size()), "The pin index in net should be smaller than fanout"); - - timing_place_setup_slacks_[net_id][ipin] = slack_val; -} - -/** - * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) - * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). 
- */ -PlacerSetupSlacks::pin_range PlacerSetupSlacks::pins_with_modified_setup_slack() const { - return vtr::make_range(cluster_pins_with_modified_setup_slack_); -} diff --git a/vpr/src/place/timing_place.h b/vpr/src/place/timing_place.h deleted file mode 100644 index 71e144334ad..00000000000 --- a/vpr/src/place/timing_place.h +++ /dev/null @@ -1,647 +0,0 @@ -/** - * @file timing_place.h - * @brief Interface used by the VPR placer to query information - * from the Tatum timing analyzer. - * - * @class PlacerSetupSlacks - * Queries connection **RAW** setup slacks, which can - * range from negative to positive values. Also maps - * atom pin setup slacks to clb pin setup slacks. - * @class PlacerCriticalities - * Query connection criticalities, which are calculuated - * based on the raw setup slacks and ranges from 0 to 1. - * Also maps atom pin crit. to clb pin crit. - * @class PlacerTimingCosts - * Hierarchical structure used by update_td_costs() to - * maintain the order of addition operation of float values - * (to avoid round-offs) while doing incremental updates. - * - * Calculating criticalities: - * All the raw setup slack values across a single clock domain are gathered - * and rated from the best to the worst in terms of criticalities. In order - * to calculate criticalities, all the slack values need to be non-negative. - * Hence, if the worst slack is negative, all the slack values are shifted - * by the value of the worst slack so that the value is at least 0. If the - * worst slack is positive, then no shift happens. - * - * The best (shifted) slack (the most positive one) will have a criticality of 0. - * The worst (shifted) slack value will have a criticality of 1. - * - * Criticalities are used to calculated timing costs for each connection. - * The formula is cost = delay * criticality. - * - * For a more detailed description on how criticalities are calculated, see - * calc_relaxed_criticality() in `timing_util.cpp`. 
- */ - -#pragma once -#include "vtr_vec_id_set.h" -#include "timing_info_fwd.h" -#include "clustered_netlist_utils.h" -#include "place_delay_model.h" -#include "vpr_net_pins_matrix.h" - -/** - * @brief Saves the placement criticality parameters - * - * crit_exponent: The criticality exponent used to sharpen the criticalities - * crit_limit: The limit to consider a pin as timing critical - */ -struct PlaceCritParams { - float crit_exponent; - float crit_limit; -}; - -/** - * @brief PlacerCriticalities returns the clustered netlist connection criticalities - * used by the placer ('sharpened' by a criticality exponent). - * - * Usage - * ===== - * This class also serves to map atom netlist level criticalites (i.e. on AtomPinIds) - * to the clustered netlist (i.e. ClusterPinIds) used during placement. - * - * Criticalities are updated by update_criticalities(), given that `update_enabled` is - * set to true. It will update criticalities based on the atom netlist connection - * criticalities provided by the passed in SetupTimingInfo. - * - * This process can be done incrementally, based on the modified connections/AtomPinIds - * returned by SetupTimingInfo. However, the set returned only reflects the connections - * changed by the last call to the timing info update. - * - * Therefore, if SetupTimingInfo is updated twice in succession without criticalities - * getting updated (update_enabled = false), the returned set cannot account for all - * the connections that have been modified. In this case, we flag `recompute_required` - * as false, and we recompute the criticalities for every connection to ensure that - * they are all up to date. Hence, each time update_setup_slacks_and_criticalities() - * is called, we assign `recompute_required` the opposite value of `update_enabled`. 
- * - * This class also maps/transforms the modified atom connections/pins returned by the - * timing info into modified clustered netlist connections/pins after calling - * update_criticalities(). The interface then enables users to iterate over this range - * via pins_with_modified_criticalities(). This is useful for incrementally re-calculating - * the timing costs. - * - * The criticalities of individual connections can then be queried by calling the - * criticality() member function. - * - * Implementation - * ============== - * To support incremental re-calculation, the class saves the last criticality exponent - * passed to PlacerCriticalities::update_criticalites(). If the next update uses the same - * exponent, criticalities can be incrementally updated. Otherwise, they must be re-calculated - * from scratch, since a change in exponent changes *all* criticalities. - */ -class PlacerCriticalities { - public: //Types - typedef vtr::vec_id_set::iterator pin_iterator; - typedef vtr::vec_id_set::iterator net_iterator; - - typedef vtr::Range pin_range; - typedef vtr::Range net_range; - - public: //Lifetime - PlacerCriticalities(const ClusteredNetlist& clb_nlist, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info); - PlacerCriticalities(const PlacerCriticalities&) = delete; - PlacerCriticalities& operator=(const PlacerCriticalities&) = delete; - - public: //Accessors - ///@brief Returns the criticality of the specified connection. - float criticality(ClusterNetId net, int ipin) const { return timing_place_crit_[net][ipin]; } - - /** - * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) which - * were modified by the last call to PlacerCriticalities::update_criticalities(). - */ - pin_range pins_with_modified_criticality() const; - - public: //Modifiers - /** - * @brief Updates criticalities based on the atom netlist criticalitites - * provided by timing_info and the provided criticality_exponent. 
- * - * Should consistently call this method after the most recent timing analysis to - * keep the criticalities stored in this class in sync with the timing analyzer. - * If out of sync, then the criticalities cannot be incrementally updated on - * during the next timing analysis iteration. - */ - void update_criticalities(const PlaceCritParams& crit_params, - PlacerState& placer_state); - - ///@bried Enable the recompute_required flag to enforce from scratch update. - void set_recompute_required(); - - ///@brief From scratch update. See timing_place.cpp for more. - void recompute_criticalities(); - - ///@brief Override the criticality of a particular connection. - void set_criticality(ClusterNetId net, int ipin, float crit_val); - - ///@brief Set `update_enabled` to true. - void enable_update() { update_enabled = true; } - - ///@brief Set `update_enabled` to true. - void disable_update() { update_enabled = false; } - - private: //Data - ///@brief The clb netlist in the placement context. - const ClusteredNetlist& clb_nlist_; - - ///@brief The lookup table that maps atom pins to clb pins. - const ClusteredPinAtomPinsLookup& pin_lookup_; - - ///@brief A pointer to the setup timing analyzer - std::shared_ptr timing_info_; - - /** - * @brief The matrix that stores criticality value for each connection. - * - * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] - */ - ClbNetPinsMatrix timing_place_crit_; - - /** - * The criticality exponent when update_criticalites() was last called - * (used to detect if incremental update can be used). - */ - float last_crit_exponent_ = std::numeric_limits::quiet_NaN(); - - ///@brief Set of pins with criticaltites modified by last call to update_criticalities(). - vtr::vec_id_set cluster_pins_with_modified_criticality_; - - ///@brief Incremental update. See timing_place.cpp for more. - void incr_update_criticalities(); - - ///@brief Flag that turns on/off the update_criticalities() routine. 
- bool update_enabled = true; - - /** - * @brief Flag that checks if criticalities need to be recomputed for all connections. - * - * Used by the method update_criticalities(). They incremental update is not possible - * if this method wasn't called updated after the previous timing info update. - */ - bool recompute_required = true; - - /** - * @brief if this is first time to call update_criticality - * - * This can be used for incremental criticality update and also incrementally update the highly critical pins - */ - bool first_time_update_criticality = true; -}; - -/** - * @brief PlacerSetupSlacks returns the RAW setup slacks of clustered netlist connection. - * - * Usage - * ===== - * This class mirrors PlacerCriticalities by both its methods and its members. The only - * difference is that this class deals with RAW setup slacks returned by SetupTimingInfo - * rather than criticalities. See the documentation on PlacerCriticalities for more. - * - * RAW setup slacks are unlike criticalities. Their values are not confined between - * 0 and 1. Their values can be either positive or negative. - * - * This class also provides iterating over the clustered netlist connections/pins that - * have modified setup slacks by the last call to update_setup_slacks(). However, this - * utility is mainly used for incrementally committing the setup slack values into the - * structure `connection_setup_slack` used by many placer routines. 
- */ -class PlacerSetupSlacks { - public: //Types - typedef vtr::vec_id_set::iterator pin_iterator; - typedef vtr::vec_id_set::iterator net_iterator; - - typedef vtr::Range pin_range; - typedef vtr::Range net_range; - - public: //Lifetime - PlacerSetupSlacks(const ClusteredNetlist& clb_nlist, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info); - PlacerSetupSlacks(const PlacerSetupSlacks& clb_nlist) = delete; - PlacerSetupSlacks& operator=(const PlacerSetupSlacks& clb_nlist) = delete; - - public: //Accessors - ///@brief Returns the setup slack of the specified connection. - float setup_slack(ClusterNetId net, int ipin) const { return timing_place_setup_slacks_[net][ipin]; } - - /** - * @brief Returns the range of clustered netlist pins (i.e. ClusterPinIds) - * which were modified by the last call to PlacerSetupSlacks::update_setup_slacks(). - */ - pin_range pins_with_modified_setup_slack() const; - - public: //Modifiers - /** - * @brief Updates setup slacks based on the atom netlist setup slacks provided - * by timing_info_. - * - * Should consistently call this method after the most recent timing analysis to - * keep the setup slacks stored in this class in sync with the timing analyzer. - * If out of sync, then the setup slacks cannot be incrementally updated on - * during the next timing analysis iteration. - */ - void update_setup_slacks(); - - ///@bried Enable the recompute_required flag to enforce from scratch update. - void set_recompute_required() { recompute_required = true; } - - ///@brief Override the setup slack of a particular connection. - void set_setup_slack(ClusterNetId net, int ipin, float slack_val); - - ///@brief Set `update_enabled` to true. - void enable_update() { update_enabled = true; } - - ///@brief Set `update_enabled` to true. 
- void disable_update() { update_enabled = false; } - - private: //Data - const ClusteredNetlist& clb_nlist_; - const ClusteredPinAtomPinsLookup& pin_lookup_; - std::shared_ptr timing_info_; - - /** - * @brief The matrix that stores raw setup slack values for each connection. - * - * Index range: [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1] - */ - ClbNetPinsMatrix timing_place_setup_slacks_; - - ///@brief Set of pins with raw setup slacks modified by last call to update_setup_slacks() - vtr::vec_id_set cluster_pins_with_modified_setup_slack_; - - ///@brief Incremental update. See timing_place.cpp for more. - void incr_update_setup_slacks(); - - ///@brief Incremental update. See timing_place.cpp for more. - void recompute_setup_slacks(); - - ///@brief Flag that turns on/off the update_setup_slacks() routine. - bool update_enabled = true; - - /** - * @brief Flag that checks if setup slacks need to be recomputed for all connections. - * - * Used by the method update_setup_slacks(). They incremental update is not possible - * if this method wasn't called updated after the previous timing info update. - */ - bool recompute_required = true; -}; - -/** - * @brief PlacerTimingCosts mimics a 2D array of connection timing costs running from: - * [0..cluster_ctx.clb_nlist.nets().size()-1][1..num_pins-1]. - * - * It can be used similar to: - * - * PlacerTimingCosts connection_timing_costs(cluster_ctx.clb_nlist); //Construct - * - * //... - * - * //Modify a connection cost - * connection_timing_costs[net_id][ipin] = new_cost; - * - * //Potentially other modifications... 
- * - * //Calculate the updated timing cost, of all connections, - * //incrementally based on modifications - * float total_timing_cost = connection_timing_costs.total_cost(); - * - * However behind the scenes PlacerTimingCosts tracks when connection costs are modified, - * and efficiently re-calculates the total timing cost incrementally based on the connections - * which have had their cost modified. - * - * Implementation - * ============== - * Internally, PlacerTimingCosts stores all connection costs in a flat array in the last part - * of connection_costs_. To mimic 2d-array like access PlacerTimingCosts also uses two proxy - * classes which allow indexing in the net and pin dimensions (NetProxy and ConnectionProxy - * respectively). - * - * The first part of connection_costs_ stores intermediate sums of the connection costs for - * efficient incremental re-calculation. More concretely, connection_costs_ stores a binary - * tree, where leaves correspond to individual connection costs and intermediate nodes the - * partial sums of the connection costs. (The binary tree is stored implicitly in the - * connection_costs_ vector, using Eytzinger's/BFS layout.) By summing the entire binary - * tree we calculate the total timing cost over all connections. - * - * Using a binary tree allows us to efficiently re-calculate the timing costs when only a subset - * of connections are changed. This is done by 'invalidating' intermediate nodes (from leaves up - * to the root) which have ancestors (leaves) with modified connection costs. When the - * total_cost() method is called, it recursively walks the binary tree to re-calculate the cost. - * Only invalidated nodes are traversed, with valid nodes just returning their previously - * calculated (and unchanged) value. - * - * For a circuit with 'K' connections, of which 'k' have changed (typically k << K), this can - * be done in O(k log K) time. 
- * - * It is important to note that due to limited floating point precision, floating point - * arithmetic has an order dependence (due to round-off). Using a binary tree to total - * the timing connection costs allows us to incrementally update the total timing cost while - * maintianing the *same order of operations* as if it was re-computed from scratch. This - * ensures we *always* get consistent results regardless of what/when connections are changed. - * - * Proxy Classes - * ============= - * NetProxy is returned by PlacerTimingCost's operator[], and stores a pointer to the start of - * internal storage of that net's connection costs. - * - * ConnectionProxy is returned by NetProxy's operator[], and holds a reference to a particular - * element of the internal storage pertaining to a specific connection's cost. ConnectionProxy - * supports assignment, allowing clients to modify the connection cost. It also detects if the - * assigned value differs from the previous value and if so, calls PlacerTimingCosts's - * invalidate() method on that connection cost. - * - * PlacerTimingCosts's invalidate() method marks the cost element's ancestors as invalid (NaN) - * so they will be re-calculated by PlacerTimingCosts' total_cost() method. - */ -class PlacerTimingCosts { - public: - PlacerTimingCosts() = default; - - PlacerTimingCosts(const ClusteredNetlist& nlist) { - auto nets = nlist.nets(); - - net_start_indicies_.resize(nets.size()); - - //Walk through the netlist to determine how many connections there are. - size_t iconn = 0; - for (ClusterNetId net : nets) { - //The placer always skips 'ignored' nets, so they don't affect timing - //costs, so we also skip them here - if (nlist.net_is_ignored(net)) { - net_start_indicies_[net] = OPEN; - continue; - } - - //Save the startind index of the current net's connections. 
- // We use a -1 offset, since sinks indexed from [1..num_net_pins-1] - // (there is no timing cost associated with net drivers) - net_start_indicies_[net] = iconn - 1; - - //Reserve space for all this net's connections - iconn += nlist.net_sinks(net).size(); - } - - size_t num_connections = iconn; - - //Determine how many binary tree levels we need to have a leaf - //for each connection cost - size_t ilevel = 0; - while (num_nodes_in_level(ilevel) < num_connections) { - ++ilevel; - } - num_levels_ = ilevel + 1; - - size_t num_leaves = num_nodes_in_level(ilevel); - size_t num_level_before_leaves = num_nodes_in_level(ilevel - 1); - - VTR_ASSERT_MSG(num_leaves >= num_connections, "Need at least as many leaves as connections"); - VTR_ASSERT_MSG( - num_connections == 0 || num_level_before_leaves < num_connections, - "Level before should have fewer nodes than connections (to ensure using the smallest binary tree)"); - - //We don't need to store all possible leaves if we have fewer connections - //(i.e. bottom-right of tree is empty) - size_t last_level_unused_nodes = num_nodes_in_level(ilevel) - num_connections; - size_t num_nodes = num_nodes_up_to_level(ilevel) - last_level_unused_nodes; - - //Reserve space for connection costs and intermediate node values - connection_costs_ = std::vector(num_nodes, std::numeric_limits::quiet_NaN()); - - //The net start indicies we calculated earlier didn't account for intermediate binary tree nodes - //Shift the start indicies after the intermediate nodes - size_t num_intermediate_nodes = num_nodes_up_to_level(ilevel - 1); - for (ClusterNetId net : nets) { - if (nlist.net_is_ignored(net)) continue; - - net_start_indicies_[net] = net_start_indicies_[net] + num_intermediate_nodes; - } - } - - /** - * @brief Proxy class representing a connection cost. - * - * Supports modification of connection cost while detecting - * changes and reporting them up to PlacerTimingCosts. 
- */ - class ConnectionProxy { - public: - ConnectionProxy(PlacerTimingCosts* timing_costs, double& connection_cost) - : timing_costs_(timing_costs) - , connection_cost_(connection_cost) {} - - ///@brief Allow clients to modify the connection cost via assignment. - ConnectionProxy& operator=(double new_cost) { - if (new_cost != connection_cost_) { - //If connection cost changed, update it, and mark it - //as invalidated - connection_cost_ = new_cost; - timing_costs_->invalidate(&connection_cost_); - } - return *this; - } - - /** - * @brief Support getting the current connection cost as a double. - * - * Useful for client code operating on the cost values (e.g. difference between costs). - */ - operator double() const { - return connection_cost_; - } - - private: - PlacerTimingCosts* timing_costs_; - double& connection_cost_; - }; - - /** - * @brief Proxy class representing the connection costs of a net. - * - * Supports indexing by pin index to retrieve the ConnectionProxy for that pin/connection. - */ - class NetProxy { - public: - NetProxy(PlacerTimingCosts* timing_costs, double* net_sink_costs) - : timing_costs_(timing_costs) - , net_sink_costs_(net_sink_costs) {} - - ///@brief Indexes into the specific net pin/connection. - ConnectionProxy operator[](size_t ipin) { - return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); - } - - const ConnectionProxy operator[](size_t ipin) const { - return ConnectionProxy(timing_costs_, net_sink_costs_[ipin]); - } - - private: - PlacerTimingCosts* timing_costs_; - double* net_sink_costs_; - }; - - ///@brief Indexes into the specific net. 
- NetProxy operator[](ClusterNetId net_id) { - VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); - - double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]]; - return NetProxy(this, net_connection_costs); - } - - NetProxy operator[](ClusterNetId net_id) const { - VTR_ASSERT_SAFE(net_start_indicies_[net_id] >= 0); - - const double* net_connection_costs = &connection_costs_[net_start_indicies_[net_id]]; - return NetProxy(const_cast(this), const_cast(net_connection_costs)); - } - - void clear() { - connection_costs_.clear(); - net_start_indicies_.clear(); - } - - void swap(PlacerTimingCosts& other) { - std::swap(connection_costs_, other.connection_costs_); - std::swap(net_start_indicies_, other.net_start_indicies_); - std::swap(num_levels_, other.num_levels_); - } - - /** - * @brief Calculates the total cost of all connections efficiently - * in the face of modified connection costs. - */ - double total_cost() { - float cost = total_cost_recurr(0); //Root - - VTR_ASSERT_DEBUG_MSG(cost == total_cost_from_scratch(0), - "Expected incremental and from-scratch costs to be consistent"); - - return cost; - } - - private: - ///@brief Recursively calculate and update the timing cost rooted at inode. 
- double total_cost_recurr(size_t inode) { - //Prune out-of-tree - if (inode > connection_costs_.size() - 1) { - return 0.; - } - - //Valid pre-calculated intermediate result or valid leaf - if (!std::isnan(connection_costs_[inode])) { - return connection_costs_[inode]; - } - - //Recompute recursively - double node_cost = total_cost_recurr(left_child(inode)) - + total_cost_recurr(right_child(inode)); - - //Save intermedate cost at this node - connection_costs_[inode] = node_cost; - - return node_cost; - } - - double total_cost_from_scratch(size_t inode) const { - //Prune out-of-tree - if (inode > connection_costs_.size() - 1) { - return 0.; - } - - //Recompute recursively - double node_cost = total_cost_from_scratch(left_child(inode)) - + total_cost_from_scratch(right_child(inode)); - - return node_cost; - } - - ///@brief Friend-ed so it can call invalidate(). - friend ConnectionProxy; - - void invalidate(double* invalidated_cost) { - //Check pointer within range of internal storage - VTR_ASSERT_SAFE_MSG( - invalidated_cost >= &connection_costs_[0], - "Connection cost pointer should be after start of internal storage"); - - VTR_ASSERT_SAFE_MSG( - invalidated_cost <= &connection_costs_[connection_costs_.size() - 1], - "Connection cost pointer should be before end of internal storage"); - - size_t icost = invalidated_cost - &connection_costs_[0]; - - VTR_ASSERT_SAFE(icost >= num_nodes_up_to_level(num_levels_ - 2)); - - //Invalidate parent intermediate costs up to root or first - //already-invalidated parent - size_t iparent = parent(icost); - - while (!std::isnan(connection_costs_[iparent])) { - //Invalidate - connection_costs_[iparent] = std::numeric_limits::quiet_NaN(); - - if (iparent == 0) { - break; //At root - } else { - //Next parent - iparent = parent(iparent); - } - } - - VTR_ASSERT_SAFE_MSG(std::isnan(connection_costs_[0]), "Invalidating any connection should have invalidated the root"); - } - - size_t left_child(size_t i) const { - return 2 * i + 1; - } - 
- size_t right_child(size_t i) const { - return 2 * i + 2; - } - - size_t parent(size_t i) const { - return (i - 1) / 2; - } - - /** - * @brief Returns the number of nodes in ilevel'th level. - * - * If ilevel is negative, return 0, since the root shouldn't - * be counted as a leaf node candidate. - */ - size_t num_nodes_in_level(int ilevel) const { - return ilevel < 0 ? 0 : (2 << (ilevel)); - } - - ///@brief Returns the total number of nodes in levels [0..ilevel] (inclusive). - size_t num_nodes_up_to_level(int ilevel) const { - return (2 << (ilevel + 1)) - 1; - } - - private: - /** - * @brief Vector storing the implicit binary tree of connection costs. - * - * The actual connections are stored at the end of the vector - * (last level of the binary tree). The earlier portions of - * the tree are the intermediate nodes. - * - * The methods left_child()/right_child()/parent() can be used - * to traverse the tree by indicies into this vector. - */ - std::vector connection_costs_; - - /** - * @brief Vector storing the indicies of the first connection - * for each net in the netlist, used for indexing by net. - */ - vtr::vector net_start_indicies_; - - ///@brief Number of levels in the binary tree. 
- size_t num_levels_ = 0; -}; diff --git a/vpr/src/place/timing_place_lookup.cpp b/vpr/src/place/timing_place_lookup.cpp deleted file mode 100644 index 86dc396e2b8..00000000000 --- a/vpr/src/place/timing_place_lookup.cpp +++ /dev/null @@ -1,1319 +0,0 @@ - -#include -#include - -#include "rr_graph_fwd.h" -#include "vtr_assert.h" -#include "vtr_ndmatrix.h" -#include "vtr_log.h" -#include "vtr_util.h" -#include "vtr_math.h" -#include "vtr_memory.h" -#include "vtr_time.h" -#include "vtr_geometry.h" - -#include "arch_util.h" -#include "vpr_types.h" -#include "globals.h" -#include "place_and_route.h" -#include "route_net.h" -#include "timing_place_lookup.h" -#include "read_xml_arch_file.h" -#include "atom_netlist.h" - -// all functions in profiling:: namespace, which are only activated if PROFILE is defined -#include "route_profiling.h" -#include "router_delay_profiling.h" -#include "place_delay_model.h" - -/*To compute delay between blocks we calculate the delay between */ -/*different nodes in the FPGA. From this procedure we generate - * a lookup table which tells us the delay between different locations in*/ -/*the FPGA */ - -/*the delta arrays are used to contain the best case routing delay */ -/*between different locations on the FPGA. 
*/ - -//#define VERBOSE - -constexpr float UNINITIALIZED_DELTA = -1; //Indicates the delta delay value has not been calculated -constexpr float EMPTY_DELTA = -2; //Indicates delta delay from/to an EMPTY block -constexpr float IMPOSSIBLE_DELTA = std::numeric_limits::infinity(); //Indicates there is no valid delta delay - -struct t_profile_loc { - t_profile_loc(int x, int y, std::vector> delta_values) - : root(x, y) - , deltas(delta_values) {} - - vtr::Point root; - std::vector> deltas; -}; - -struct t_profile_info { - std::vector locations; - - int max_delta_x; - int max_delta_y; -}; - -/*** Function Prototypes *****/ -static t_chan_width setup_chan_width(const t_router_opts& router_opts, - t_chan_width_dist chan_width_dist); - -static float route_connection_delay( - RouterDelayProfiler& route_profiler, - int from_layer_num, - int to_layer_num, - int source_x_loc, - int source_y_loc, - int sink_x_loc, - int sink_y_loc, - const t_router_opts& router_opts, - bool measure_directconnect); - -// Prototype for computing delta delay matrix. 
-typedef std::function>&, - int, - int, - int, - int, - int, - int, - int, - int, - const t_router_opts&, - bool, - const std::set&, - bool)> - t_compute_delta_delay_matrix; - -static void generic_compute_matrix_iterative_astar( - RouterDelayProfiler& route_profiler, - vtr::Matrix>& matrix, - int from_layer_num, - int to_layer_num, - int source_x, - int source_y, - int start_x, - int start_y, - int end_x, - int end_y, - const t_router_opts& router_opts, - bool measure_directconnect, - const std::set& allowed_types, - bool /***/); - -static void generic_compute_matrix_dijkstra_expansion( - RouterDelayProfiler& route_profiler, - vtr::Matrix>& matrix, - int from_layer_num, - int to_layer_num, - int source_x, - int source_y, - int start_x, - int start_y, - int end_x, - int end_y, - const t_router_opts& router_opts, - bool measure_directconnect, - const std::set& allowed_types, - bool is_flat); - -static vtr::NdMatrix compute_delta_delays( - RouterDelayProfiler& route_profiler, - const t_placer_opts& palcer_opts, - const t_router_opts& router_opts, - bool measure_directconnect, - size_t longest_length, - bool is_flat); - -float delay_reduce(std::vector& delays, e_reducer reducer); - -static vtr::NdMatrix compute_delta_delay_model( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - bool measure_directconnect, - int longest_length, - bool is_flat); - -/** - * @brief Use the information in the router lookahead to fill the delay matrix instead of running the router - * @param route_profiler - * @return The delay matrix that contain the minimum cost between two locations - */ -static vtr::NdMatrix compute_simple_delay_model(RouterDelayProfiler& route_profiler); - -static bool find_direct_connect_sample_locations(const t_direct_inf* direct, - t_physical_tile_type_ptr from_type, - int from_pin, - int from_pin_class, - t_physical_tile_type_ptr to_type, - int to_pin, - int to_pin_class, - RRNodeId& out_src_node, - 
RRNodeId& out_sink_node); - -static bool verify_delta_delays(const vtr::NdMatrix& delta_delays); - -static int get_longest_segment_length(std::vector& segment_inf); - -static void fix_empty_coordinates(vtr::NdMatrix& delta_delays); -static void fix_uninitialized_coordinates(vtr::NdMatrix& delta_delays); - -static float find_neighboring_average(vtr::NdMatrix& matrix, - int from_layer, - t_physical_tile_loc to_tile_loc, - int max_distance); - -/******* Globally Accessible Functions **********/ - -std::unique_ptr compute_place_delay_model(const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - const Netlist<>& net_list, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - t_chan_width_dist chan_width_dist, - const std::vector& directs, - bool is_flat) { - vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up"); - - t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist); - - alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf, directs, is_flat); - - const RouterLookahead* router_lookahead = get_cached_router_lookahead(*det_routing_arch, - router_opts.lookahead_type, - router_opts.write_router_lookahead, - router_opts.read_router_lookahead, - segment_inf, - is_flat); - - RouterDelayProfiler route_profiler(net_list, router_lookahead, is_flat); - - int longest_length = get_longest_segment_length(segment_inf); - - /*now setup and compute the actual arrays */ - std::unique_ptr place_delay_model; - float min_cross_layer_delay = get_min_cross_layer_delay(); - - if (placer_opts.delay_model_type == PlaceDelayModelType::SIMPLE) { - place_delay_model = std::make_unique(); - } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) { - place_delay_model = std::make_unique(min_cross_layer_delay, is_flat); - } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) { - place_delay_model = std::make_unique(min_cross_layer_delay, is_flat); - } else { - 
VTR_ASSERT_MSG(false, "Invalid placer delay model"); - } - - if (placer_opts.read_placement_delay_lookup.empty()) { - place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length); - } else { - place_delay_model->read(placer_opts.read_placement_delay_lookup); - } - - if (!placer_opts.write_placement_delay_lookup.empty()) { - place_delay_model->write(placer_opts.write_placement_delay_lookup); - } - - /*free all data structures that are no longer needed */ - free_routing_structs(); - - return place_delay_model; -} - -void DeltaDelayModel::compute( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) { - delays_ = compute_delta_delay_model( - route_profiler, - placer_opts, router_opts, /*measure_directconnect=*/true, - longest_length, - is_flat_); -} - -void OverrideDelayModel::compute( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - int longest_length) { - auto delays = compute_delta_delay_model( - route_profiler, - placer_opts, router_opts, /*measure_directconnect=*/false, - longest_length, - is_flat_); - - base_delay_model_ = std::make_unique(cross_layer_delay_, delays, false); - - compute_override_delay_model(route_profiler, router_opts); -} - -void SimpleDelayModel::compute( - RouterDelayProfiler& router, - const t_placer_opts& /*placer_opts*/, - const t_router_opts& /*router_opts*/, - int /*longest_length*/) { - delays_ = compute_simple_delay_model(router); -} - -/******* File Accessible Functions **********/ - -std::vector get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) { - /* - * This function tries to identify the best pin classes to hook up - * for delay calculation. The assumption is that we should pick - * the pin class with the largest number of pins. 
This makes - * sense, since it ensures we pick commonly used pins, and - * removes order dependence on how the pins are specified - * in the architecture (except in the case were the two largest pin classes - * of a particular pintype have the same number of pins, in which case the - * first pin class is used). - */ - - std::vector best_classes; - - //Record any non-zero Fc pins - // - //Note that we track non-zero Fc pins, since certain Fc overides - //may apply to only a subset of wire types. This ensures we record - //which pins can potentially connect to global routing. - std::unordered_set non_zero_fc_pins; - for (const t_fc_specification& fc_spec : type->fc_specs) { - if (fc_spec.fc_value == 0) continue; - - non_zero_fc_pins.insert(fc_spec.pins.begin(), fc_spec.pins.end()); - } - - //Collect all classes of matching type which connect to general routing - for (int i = 0; i < (int)type->class_inf.size(); i++) { - if (type->class_inf[i].type == pintype) { - //Check whether all pins in this class are ignored or have zero fc - bool any_pins_connect_to_general_routing = false; - for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) { - int pin = type->class_inf[i].pinlist[ipin]; - //If the pin isn't ignored, and has a non-zero Fc to some general - //routing the class is suitable for delay profiling - if (!type->is_ignored_pin[pin] && non_zero_fc_pins.count(pin)) { - any_pins_connect_to_general_routing = true; - break; - } - } - - if (!any_pins_connect_to_general_routing) continue; //Skip if doesn't connect to general routing - - //Record candidate class - best_classes.push_back(i); - } - } - - //Sort classe so largest pin class is first - auto cmp_class = [&](int lhs, int rhs) { - return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins; - }; - - std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class); - - return best_classes; -} - -static int get_longest_segment_length(std::vector& segment_inf) { - int length = 0; - - for (const 
t_segment_inf &seg_info : segment_inf) { - if (seg_info.length > length) { - length = seg_info.length; - } - } - - return length; -} - -static t_chan_width setup_chan_width(const t_router_opts& router_opts, - t_chan_width_dist chan_width_dist) { - /*we give plenty of tracks, this increases routability for the */ - /*lookup table generation */ - - t_graph_type graph_directionality; - int width_fac; - - if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) { - auto& device_ctx = g_vpr_ctx.device(); - - auto type = find_most_common_tile_type(device_ctx.grid); - - width_fac = 4 * type->num_pins; - /*this is 2x the value that binary search starts */ - /*this should be enough to allow most pins to */ - /*connect to tracks in the architecture */ - } else { - width_fac = router_opts.fixed_channel_width; - } - - if (router_opts.route_type == GLOBAL) { - graph_directionality = GRAPH_BIDIR; - } else { - graph_directionality = GRAPH_UNIDIR; - } - - return init_chan(width_fac, chan_width_dist, graph_directionality); -} - -static float route_connection_delay( - RouterDelayProfiler& route_profiler, - int from_layer_num, - int to_layer_num, - int source_x, - int source_y, - int sink_x, - int sink_y, - const t_router_opts& router_opts, - bool measure_directconnect) { - //Routes between the source and sink locations and calculates the delay - - float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */ - - auto& device_ctx = g_vpr_ctx.device(); - - bool successfully_routed = false; - - //Get the rr nodes to route between - auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num})); - auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num})); - - for (int driver_ptc : best_driver_ptcs) { - VTR_ASSERT(driver_ptc != OPEN); - RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, 
SOURCE, driver_ptc); - - VTR_ASSERT(source_rr_node != RRNodeId::INVALID()); - - for (int sink_ptc : best_sink_ptcs) { - VTR_ASSERT(sink_ptc != OPEN); - RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc); - - if (sink_rr_node == RRNodeId::INVALID()) - continue; - - if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) { - //Skip if we shouldn't measure direct connects and a direct connect exists - continue; - } - - { - successfully_routed = route_profiler.calculate_delay( - source_rr_node, sink_rr_node, - router_opts, - &net_delay_value); - } - - if (successfully_routed) break; - } - if (successfully_routed) break; - } - - if (!successfully_routed) { - VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n", - source_x, source_y, from_layer_num, sink_x, sink_y, to_layer_num, net_delay_value); - } - - return (net_delay_value); -} - -static void add_delay_to_matrix( - vtr::Matrix>* matrix, - int delta_x, - int delta_y, - float delay) { - if ((*matrix)[delta_x][delta_y].size() == 1 && (*matrix)[delta_x][delta_y][0] == EMPTY_DELTA) { - //Overwrite empty delta - (*matrix)[delta_x][delta_y][0] = delay; - } else { - //Collect delta - (*matrix)[delta_x][delta_y].push_back(delay); - } -} - -static void generic_compute_matrix_dijkstra_expansion( - RouterDelayProfiler& /*route_profiler*/, - vtr::Matrix>& matrix, - int from_layer_num, - int to_layer_num, - int source_x, - int source_y, - int start_x, - int start_y, - int end_x, - int end_y, - const t_router_opts& router_opts, - bool measure_directconnect, - const std::set& allowed_types, - bool is_flat) { - auto& device_ctx = g_vpr_ctx.device(); - - t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}); - bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end(); - if (src_type == 
device_ctx.EMPTY_PHYSICAL_TILE_TYPE || !is_allowed_type) { - for (int sink_x = start_x; sink_x <= end_x; sink_x++) { - for (int sink_y = start_y; sink_y <= end_y; sink_y++) { - int delta_x = abs(sink_x - source_x); - int delta_y = abs(sink_y - source_y); - - if (matrix[delta_x][delta_y].empty()) { - //Only set empty target if we don't already have a valid delta delay - matrix[delta_x][delta_y].push_back(EMPTY_DELTA); -#ifdef VERBOSE - VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", - "EMPTY", - delta_x, delta_y, - source_x, source_y, - sink_x, sink_y); -#endif - } - } - } - - return; - } - - vtr::Matrix found_matrix({matrix.dim_size(0), matrix.dim_size(1)}, false); - - auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num})); - for (int driver_ptc : best_driver_ptcs) { - VTR_ASSERT(driver_ptc != OPEN); - RRNodeId source_rr_node = device_ctx.rr_graph.node_lookup().find_node(from_layer_num, source_x, source_y, SOURCE, driver_ptc); - - VTR_ASSERT(source_rr_node != RRNodeId::INVALID()); - auto delays = calculate_all_path_delays_from_rr_node(source_rr_node, router_opts, is_flat); - - bool path_to_all_sinks = true; - for (int sink_x = start_x; sink_x <= end_x; sink_x++) { - for (int sink_y = start_y; sink_y <= end_y; sink_y++) { - int delta_x = abs(sink_x - source_x); - int delta_y = abs(sink_y - source_y); - - if (found_matrix[delta_x][delta_y]) { - continue; - } - - t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}); - if (sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { - if (matrix[delta_x][delta_y].empty()) { - //Only set empty target if we don't already have a valid delta delay - matrix[delta_x][delta_y].push_back(EMPTY_DELTA); -#ifdef VERBOSE - VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", - "EMPTY", - delta_x, delta_y, - source_x, source_y, - sink_x, sink_y); -#endif - 
found_matrix[delta_x][delta_y] = true; - } - } else { - bool found_a_sink = false; - auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num})); - for (int sink_ptc : best_sink_ptcs) { - VTR_ASSERT(sink_ptc != OPEN); - RRNodeId sink_rr_node = device_ctx.rr_graph.node_lookup().find_node(to_layer_num, sink_x, sink_y, SINK, sink_ptc); - - if (sink_rr_node == RRNodeId::INVALID()) - continue; - - if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) { - //Skip if we shouldn't measure direct connects and a direct connect exists - continue; - } - - if (std::isnan(delays[sink_rr_node])) { - // This sink was not found - continue; - } - -#ifdef VERBOSE - VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n", - delays[size_t(sink_rr_node)], - delta_x, delta_y, - source_x, source_y, - sink_x, sink_y); -#endif - found_matrix[delta_x][delta_y] = true; - - add_delay_to_matrix(&matrix, delta_x, delta_y, delays[sink_rr_node]); - - found_a_sink = true; - break; - } - - if (!found_a_sink) { - path_to_all_sinks = false; - } - } - } - } - - if (path_to_all_sinks) { - break; - } - } - - for (int sink_x = start_x; sink_x <= end_x; sink_x++) { - for (int sink_y = start_y; sink_y <= end_y; sink_y++) { - int delta_x = abs(sink_x - source_x); - int delta_y = abs(sink_y - source_y); - if (!found_matrix[delta_x][delta_y]) { - add_delay_to_matrix(&matrix, delta_x, delta_y, IMPOSSIBLE_DELTA); - VTR_LOG_WARN("Unable to route between blocks at (%d,%d,%d) and (%d,%d,%d) to characterize delay (setting to %g)\n", - source_x, - source_y, - from_layer_num, - sink_x, - sink_y, - to_layer_num, - IMPOSSIBLE_DELTA); - } - } - } -} - -static void generic_compute_matrix_iterative_astar( - RouterDelayProfiler& route_profiler, - vtr::Matrix>& matrix, - int from_layer_num, - int to_layer_num, - int source_x, - int source_y, - int start_x, - int start_y, - int end_x, - int end_y, - const t_router_opts& 
router_opts, - bool measure_directconnect, - const std::set& allowed_types, - bool /***/) { - //vtr::ScopedStartFinishTimer t(vtr::string_fmt("Profiling from (%d,%d)", source_x, source_y)); - - int delta_x, delta_y; - int sink_x, sink_y; - - auto& device_ctx = g_vpr_ctx.device(); - - for (sink_x = start_x; sink_x <= end_x; sink_x++) { - for (sink_y = start_y; sink_y <= end_y; sink_y++) { - delta_x = abs(sink_x - source_x); - delta_y = abs(sink_y - source_y); - - t_physical_tile_type_ptr src_type = device_ctx.grid.get_physical_type({source_x, source_y, from_layer_num}); - t_physical_tile_type_ptr sink_type = device_ctx.grid.get_physical_type({sink_x, sink_y, to_layer_num}); - - bool src_or_target_empty = (src_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE - || sink_type == device_ctx.EMPTY_PHYSICAL_TILE_TYPE); - - bool is_allowed_type = allowed_types.empty() || allowed_types.find(src_type->name) != allowed_types.end(); - - if (src_or_target_empty || !is_allowed_type) { - if (matrix[delta_x][delta_y].empty()) { - //Only set empty target if we don't already have a valid delta delay - matrix[delta_x][delta_y].push_back(EMPTY_DELTA); -#ifdef VERBOSE - VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n", - "EMPTY", - delta_x, delta_y, - source_x, source_y, - sink_x, sink_y); -#endif - } - } else { - //Valid start/end - - float delay = route_connection_delay(route_profiler, - from_layer_num, - to_layer_num, - source_x, - source_y, - sink_x, - sink_y, - router_opts, - measure_directconnect); - -#ifdef VERBOSE - VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n", - delay, - delta_x, delta_y, - source_x, source_y, - sink_x, sink_y); -#endif - if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) { - //Overwrite empty delta - matrix[delta_x][delta_y][0] = delay; - } else { - //Collect delta - matrix[delta_x][delta_y].push_back(delay); - } - } - } - } -} - -static vtr::NdMatrix compute_delta_delays( - 
RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - bool measure_directconnect, - size_t longest_length, - bool is_flat) { - //To avoid edge effects we place the source at least 'longest_length' away - //from the device edge - //and route from there for all possible delta values < dimension - - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - - vtr::NdMatrix delta_delays({static_cast(grid.get_num_layers()), static_cast(grid.get_num_layers()), grid.width(), grid.height()}); - - for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); from_layer_num++) { - for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); to_layer_num++) { - vtr::NdMatrix, 2> sampled_delta_delays({grid.width(), grid.height()}); - - size_t mid_x = vtr::nint(grid.width() / 2); - size_t mid_y = vtr::nint(grid.height() / 2); - - size_t low_x = std::min(longest_length, mid_x); - size_t low_y = std::min(longest_length, mid_y); - size_t high_x = mid_x; - size_t high_y = mid_y; - if (longest_length <= grid.width()) { - high_x = std::max(grid.width() - longest_length, mid_x); - } - if (longest_length <= grid.height()) { - high_y = std::max(grid.height() - longest_length, mid_y); - } - - std::set allowed_types; - if (!placer_opts.allowed_tiles_for_delay_model.empty()) { - auto allowed_types_vector = vtr::split(placer_opts.allowed_tiles_for_delay_model, ","); - for (const auto& type : allowed_types_vector) { - allowed_types.insert(type); - } - } - - // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - // + | | + - // + A | B | C + - // + | | + - // +-----------------\-----------------------.---------------+ - // + | | + - // + | | + - // + | | + - // + | | + - // + D | E | F + - // + | | + - // + | | + - // + | | + - // + | | + - // +-----------------*-----------------------/---------------+ - // + | | + - // + G | H | I + - // + | | + - // 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - // - // * = (low_x, low_y) - // . = (high_x, high_y) - // / = (high_x, low_y) - // \ = (low_x, high_y) - // + = device edge - - //Find the lowest y location on the left edge with a non-empty block - int y = 0; - int x = 0; - t_physical_tile_type_ptr src_type = nullptr; - for (x = 0; x < (int)grid.width(); ++x) { - for (y = 0; y < (int)grid.height(); ++y) { - auto type = grid.get_physical_type({x, y, from_layer_num}); - - if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { - if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) { - continue; - } - src_type = type; - break; - } - } - if (src_type) { - break; - } - } - VTR_ASSERT(src_type != nullptr); - - t_compute_delta_delay_matrix generic_compute_matrix; - switch (placer_opts.place_delta_delay_matrix_calculation_method) { - case e_place_delta_delay_algorithm::ASTAR_ROUTE: - generic_compute_matrix = generic_compute_matrix_iterative_astar; - break; - case e_place_delta_delay_algorithm::DIJKSTRA_EXPANSION: - generic_compute_matrix = generic_compute_matrix_dijkstra_expansion; - break; - default: - VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unknown place_delta_delay_matrix_calculation_method %d", placer_opts.place_delta_delay_matrix_calculation_method); - } - -#ifdef VERBOSE - VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - x, y, - x, y, - grid.width() - 1, grid.height() - 1, - router_opts, - measure_directconnect, allowed_types, - is_flat); - - //Find the lowest x location on the bottom edge with a non-empty block - src_type = nullptr; - for (y = 0; y < (int)grid.height(); ++y) { - for (x = 0; x < (int)grid.width(); ++x) { - auto type = grid.get_physical_type({x, y, from_layer_num}); - - if (type != device_ctx.EMPTY_PHYSICAL_TILE_TYPE) { - if (!allowed_types.empty() && allowed_types.find(type->name) == allowed_types.end()) 
{ - continue; - } - src_type = type; - break; - } - } - if (src_type) { - break; - } - } - VTR_ASSERT(src_type != nullptr); -#ifdef VERBOSE - VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - x, y, - x, y, - grid.width() - 1, grid.height() - 1, - router_opts, - measure_directconnect, allowed_types, - is_flat); - - //Since the other delta delay values may have suffered from edge effects, - //we recalculate deltas within regions B, C, E, F -#ifdef VERBOSE - VTR_LOG("Computing from low/low:\n"); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - low_x, low_y, - low_x, low_y, - grid.width() - 1, grid.height() - 1, - router_opts, - measure_directconnect, allowed_types, - is_flat); - - //Since the other delta delay values may have suffered from edge effects, - //we recalculate deltas within regions D, E, G, H -#ifdef VERBOSE - VTR_LOG("Computing from high/high:\n"); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - high_x, high_y, - 0, 0, - high_x, high_y, - router_opts, - measure_directconnect, allowed_types, - is_flat); - - //Since the other delta delay values may have suffered from edge effects, - //we recalculate deltas within regions A, B, D, E -#ifdef VERBOSE - VTR_LOG("Computing from high/low:\n"); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - high_x, low_y, - 0, low_y, - high_x, grid.height() - 1, - router_opts, - measure_directconnect, allowed_types, - is_flat); - - //Since the other delta delay values may have suffered from edge effects, - //we recalculate deltas within regions E, F, H, I -#ifdef VERBOSE - VTR_LOG("Computing from low/high:\n"); -#endif - generic_compute_matrix(route_profiler, sampled_delta_delays, - from_layer_num, to_layer_num, - low_x, high_y, - low_x, 0, - 
grid.width() - 1, high_y, - router_opts, - measure_directconnect, allowed_types, - is_flat); - for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) { - for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) { - delta_delays[from_layer_num][to_layer_num][dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer); - } - } - } - } - - return delta_delays; -} - -float delay_reduce(std::vector& delays, e_reducer reducer) { - if (delays.empty()) { - return IMPOSSIBLE_DELTA; - } else if (delays.size() == 1) { - return delays[0]; - } - - VTR_ASSERT(delays.size() > 1); - - float delay; - - if (reducer == e_reducer::MIN) { - auto itr = std::min_element(delays.begin(), delays.end()); - delay = *itr; - } else if (reducer == e_reducer::MAX) { - auto itr = std::max_element(delays.begin(), delays.end()); - delay = *itr; - } else if (reducer == e_reducer::MEDIAN) { - std::stable_sort(delays.begin(), delays.end()); - delay = vtr::median(delays.begin(), delays.end()); - } else if (reducer == e_reducer::ARITHMEAN) { - delay = vtr::arithmean(delays.begin(), delays.end()); - } else if (reducer == e_reducer::GEOMEAN) { - delay = vtr::geomean(delays.begin(), delays.end()); - } else { - VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer"); - } - - return delay; -} - -/* We return the average placement estimated delay for a routing spanning (x,y). - * We start with an averaging distance of 1 (i.e. from (x-1,y-1) to (x+1,y+1)) - * and look for legal delay values to average; if some are found we return the - * average and if none are found we increase the distance to average over. - * - * If no legal values are found to average over with a range of max_distance, - * we return IMPOSSIBLE_DELTA. 
- */ -static float find_neighboring_average( - vtr::NdMatrix& matrix, - int from_layer, - t_physical_tile_loc to_tile_loc, - int max_distance) { - float sum = 0; - int counter = 0; - int endx = matrix.end_index(2); - int endy = matrix.end_index(3); - - int x = to_tile_loc.x; - int y = to_tile_loc.y; - int to_layer = to_tile_loc.layer_num; - - for (int distance = 1; distance <= max_distance; ++distance) { - for (int delx = x - distance; delx <= x + distance; delx++) { - for (int dely = y - distance; dely <= y + distance; dely++) { - // Check distance constraint - if (abs(delx - x) + abs(dely - y) > distance) { - continue; - } - - //check out of bounds - if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) { - continue; - } - - if (matrix[from_layer][to_layer][delx][dely] == EMPTY_DELTA || matrix[from_layer][to_layer][delx][dely] == IMPOSSIBLE_DELTA) { - continue; - } - counter++; - sum += matrix[from_layer][to_layer][delx][dely]; - } - } - if (counter != 0) { - return sum / (float)counter; - } - } - - return IMPOSSIBLE_DELTA; -} - -static void fix_empty_coordinates(vtr::NdMatrix& delta_delays) { - // Set any empty delta's to the average of it's neighbours - // - // Empty coordinates may occur if the sampling location happens to not have - // a connection at that location. However a more through sampling likely - // would return a result, so we fill in the empty holes with a small - // neighbour average. 
- constexpr int kMaxAverageDistance = 2; - for (int from_layer = 0; from_layer < (int)delta_delays.dim_size(0); ++from_layer) { - for (int to_layer = 0; to_layer < (int)delta_delays.dim_size(1); ++to_layer) { - for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) { - for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) { - if (delta_delays[from_layer][to_layer][delta_x][delta_y] == EMPTY_DELTA) { - delta_delays[from_layer][to_layer][delta_x][delta_y] = - find_neighboring_average(delta_delays, - from_layer, - {delta_x, delta_y, to_layer}, - kMaxAverageDistance); - } - } - } - } - } -} - -static void fix_uninitialized_coordinates(vtr::NdMatrix& delta_delays) { - // Set any empty delta's to the average of it's neighbours - - for (size_t from_layer_num = 0; from_layer_num < delta_delays.dim_size(0); ++from_layer_num) { - for (size_t to_layer_num = 0; to_layer_num < delta_delays.dim_size(1); ++to_layer_num) { - for (size_t delta_x = 0; delta_x < delta_delays.dim_size(2); ++delta_x) { - for (size_t delta_y = 0; delta_y < delta_delays.dim_size(3); ++delta_y) { - if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == UNINITIALIZED_DELTA) { - delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = IMPOSSIBLE_DELTA; - } - } - } - } - } -} - -static void fill_impossible_coordinates(vtr::NdMatrix& delta_delays) { - // Set any impossible delta's to the average of its neighbours - // - // Impossible coordinates may occur if an IPIN cannot be reached from the - // sampling OPIN. This might occur if the IPIN or OPIN used for sampling - // is specialized, and therefore cannot be reached via the by the pins - // sampled. Leaving this value in the delay matrix will result in invalid - // slacks if the delay matrix uses this value. - // - // A max average distance of 5 is used to provide increased effort in - // filling these gaps. 
It is more important to have a poor predication, - // than an invalid value and causing a slack assertion. - constexpr int kMaxAverageDistance = 5; - for (int from_layer_num = 0; from_layer_num < (int)delta_delays.dim_size(0); ++from_layer_num) { - for (int to_layer_num = 0; to_layer_num < (int)delta_delays.dim_size(1); ++to_layer_num) { - for (int delta_x = 0; delta_x < (int)delta_delays.dim_size(2); ++delta_x) { - for (int delta_y = 0; delta_y < (int)delta_delays.dim_size(3); ++delta_y) { - if (delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] == IMPOSSIBLE_DELTA) { - delta_delays[from_layer_num][to_layer_num][delta_x][delta_y] = find_neighboring_average( - delta_delays, from_layer_num, {delta_x, delta_y, to_layer_num}, kMaxAverageDistance); - } - } - } - } - } -} - -static vtr::NdMatrix compute_delta_delay_model( - RouterDelayProfiler& route_profiler, - const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - bool measure_directconnect, - int longest_length, - bool is_flat) { - vtr::ScopedStartFinishTimer timer("Computing delta delays"); - vtr::NdMatrix delta_delays = compute_delta_delays(route_profiler, - placer_opts, - router_opts, - measure_directconnect, - longest_length, - is_flat); - - fix_uninitialized_coordinates(delta_delays); - - fix_empty_coordinates(delta_delays); - - fill_impossible_coordinates(delta_delays); - - verify_delta_delays(delta_delays); - - return delta_delays; -} - -static vtr::NdMatrix compute_simple_delay_model(RouterDelayProfiler& route_profiler) { - const auto& grid = g_vpr_ctx.device().grid; - int num_physical_tile_types = static_cast(g_vpr_ctx.device().physical_tile_types.size()); - // Initializing the delay matrix to [num_physical_types][num_layers][num_layers][width][height] - // The second index related to the layer that the source location is on and the third index is for the sink layer - vtr::NdMatrix delta_delays({static_cast(num_physical_tile_types), - static_cast(grid.get_num_layers()), - 
static_cast(grid.get_num_layers()), - grid.width(), - grid.height()}); - - for (int physical_tile_type_idx = 0; physical_tile_type_idx < num_physical_tile_types; ++physical_tile_type_idx) { - for (int from_layer = 0; from_layer < grid.get_num_layers(); ++from_layer) { - for (int to_layer = 0; to_layer < grid.get_num_layers(); ++to_layer) { - for (int dx = 0; dx < static_cast(grid.width()); ++dx) { - for (int dy = 0; dy < static_cast(grid.height()); ++dy) { - float min_delay = route_profiler.get_min_delay(physical_tile_type_idx, - from_layer, - to_layer, - dx, - dy); - delta_delays[physical_tile_type_idx][from_layer][to_layer][dx][dy] = min_delay; - } - } - } - } - } - - return delta_delays; -} - -//Finds a src_rr and sink_rr appropriate for measuring the delay of the current direct specification -static bool find_direct_connect_sample_locations(const t_direct_inf* direct, - t_physical_tile_type_ptr from_type, - int from_pin, - int from_pin_class, - t_physical_tile_type_ptr to_type, - int to_pin, - int to_pin_class, - RRNodeId& out_src_node, - RRNodeId& out_sink_node) { - VTR_ASSERT(from_type != nullptr); - VTR_ASSERT(to_type != nullptr); - - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - const auto& node_lookup = device_ctx.rr_graph.node_lookup(); - - //Search the grid for an instance of from/to blocks which satisfy this direct connect offsets, - //and which has the appropriate pins - int from_x = -1; - int from_y = -1; - int from_sub_tile = -1; - int to_x = 0, to_y = 0, to_sub_tile = 0; - bool found = false; - int found_layer_num = -1; - //TODO: Function *FOR NOW* assumes that from/to blocks are at same die and have a same layer nums - for (int layer_num = 0; layer_num < grid.get_num_layers() && !found; ++layer_num) { - for (int x = 0; x < (int)grid.width() && !found; ++x) { - to_x = x + direct->x_offset; - if (to_x < 0 || to_x >= (int)grid.width()) continue; - - for (int y = 0; y < (int)grid.height() && !found; ++y) { - if 
(grid.get_physical_type({x, y, layer_num}) != from_type) continue; - - //Check that the from pin exists at this from location - //(with multi-width/height blocks pins may not exist at all locations) - bool from_pin_found = false; - if (direct->from_side != NUM_2D_SIDES) { - RRNodeId from_pin_rr = node_lookup.find_node(layer_num, x, y, OPIN, from_pin, direct->from_side); - from_pin_found = from_pin_rr.is_valid(); - } else { - from_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, x, y, OPIN, from_pin).empty()); - } - if (!from_pin_found) continue; - - to_y = y + direct->y_offset; - - if (to_y < 0 || to_y >= (int)grid.height()) continue; - if (grid.get_physical_type({to_x, to_y, layer_num}) != to_type) continue; - - //Check that the from pin exists at this from location - //(with multi-width/height blocks pins may not exist at all locations) - bool to_pin_found = false; - if (direct->to_side != NUM_2D_SIDES) { - RRNodeId to_pin_rr = node_lookup.find_node(layer_num, to_x, to_y, IPIN, to_pin, direct->to_side); - to_pin_found = (to_pin_rr != RRNodeId::INVALID()); - } else { - to_pin_found = !(node_lookup.find_nodes_at_all_sides(layer_num, to_x, to_y, IPIN, to_pin).empty()); - } - if (!to_pin_found) continue; - - for (int sub_tile_num = 0; sub_tile_num < from_type->capacity; ++sub_tile_num) { - to_sub_tile = sub_tile_num + direct->sub_tile_offset; - - if (to_sub_tile < 0 || to_sub_tile >= to_type->capacity) continue; - - found = true; - found_layer_num = layer_num; - from_x = x; - from_y = y; - from_sub_tile = sub_tile_num; - - break; - } - } - } - } - - if (!found) { - return false; - } - - //Now have a legal instance of this direct connect - VTR_ASSERT(grid.get_physical_type({from_x, from_y, found_layer_num}) == from_type); - VTR_ASSERT(from_sub_tile < from_type->capacity); - - VTR_ASSERT(grid.get_physical_type({to_x, to_y, found_layer_num}) == to_type); - VTR_ASSERT(to_sub_tile < to_type->capacity); - - VTR_ASSERT(from_x + direct->x_offset == to_x); - 
VTR_ASSERT(from_y + direct->y_offset == to_y); - VTR_ASSERT(from_sub_tile + direct->sub_tile_offset == to_sub_tile); - - // - //Find a source/sink RR node associated with the pins of the direct - // - - { - RRNodeId src_rr_candidate = node_lookup.find_node(found_layer_num, from_x, from_y, SOURCE, from_pin_class); - VTR_ASSERT(src_rr_candidate); - out_src_node = src_rr_candidate; - } - - { - RRNodeId sink_rr_candidate = node_lookup.find_node(found_layer_num, to_x, to_y, SINK, to_pin_class); - VTR_ASSERT(sink_rr_candidate); - out_sink_node = sink_rr_candidate; - } - - return true; -} - -static bool verify_delta_delays(const vtr::NdMatrix& delta_delays) { - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - - for (int from_layer_num = 0; from_layer_num < grid.get_num_layers(); ++from_layer_num) { - for (int to_layer_num = 0; to_layer_num < grid.get_num_layers(); ++to_layer_num) { - for (size_t x = 0; x < grid.width(); ++x) { - for (size_t y = 0; y < grid.height(); ++y) { - float delta_delay = delta_delays[from_layer_num][to_layer_num][x][y]; - - if (delta_delay < 0.) 
{ - VPR_ERROR(VPR_ERROR_PLACE, - "Found invaild negative delay %g for delta [%d,%d,%d,%d]", - delta_delay, from_layer_num, to_layer_num, x, y); - } - } - } - } - } - - return true; -} - -void OverrideDelayModel::compute_override_delay_model( - RouterDelayProfiler& route_profiler, - const t_router_opts& router_opts) { - t_router_opts router_opts2 = router_opts; - router_opts2.astar_fac = 0.f; - router_opts2.astar_offset = 0.f; - - //Look at all the direct connections that exist, and add overrides to delay model - auto& device_ctx = g_vpr_ctx.device(); - for (int idirect = 0; idirect < (int)device_ctx.arch->directs.size(); ++idirect) { - const t_direct_inf* direct = &device_ctx.arch->directs[idirect]; - - InstPort from_port = parse_inst_port(direct->from_pin); - InstPort to_port = parse_inst_port(direct->to_pin); - - t_physical_tile_type_ptr from_type = find_tile_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types); - t_physical_tile_type_ptr to_type = find_tile_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types); - - int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1; - VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from"); - - //We now walk through all the connections associated with the current direct specification, measure - //their delay and specify that value as an override in the delay model. - // - //Note that we need to check every connection in the direct to cover the case where the pins are not - //equivalent. - // - //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK - //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in - //sampled_rr_pairs and skipping them if they occur multiple times. 
- int missing_instances = 0; - int missing_paths = 0; - std::set> sampled_rr_pairs; - for (int iconn = 0; iconn < num_conns; ++iconn) { - //Find the associated pins - int from_pin = find_pin(from_type, from_port.port_name(), from_port.port_low_index() + iconn); - int to_pin = find_pin(to_type, to_port.port_name(), to_port.port_low_index() + iconn); - - VTR_ASSERT(from_pin != OPEN); - VTR_ASSERT(to_pin != OPEN); - - int from_pin_class = find_pin_class(from_type, from_port.port_name(), from_port.port_low_index() + iconn, DRIVER); - VTR_ASSERT(from_pin_class != OPEN); - - int to_pin_class = find_pin_class(to_type, to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER); - VTR_ASSERT(to_pin_class != OPEN); - - bool found_sample_points; - RRNodeId src_rr, sink_rr; - found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, src_rr, sink_rr); - - if (!found_sample_points) { - ++missing_instances; - continue; - } - - //If some of the source/sink ports are logically equivalent we may have already - //sampled the associated source/sink pair and don't need to do so again - if (sampled_rr_pairs.count({src_rr, sink_rr})) continue; - - float direct_connect_delay = std::numeric_limits::quiet_NaN(); - bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay); - - if (found_routing_path) { - set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay); - } else { - ++missing_paths; - } - - //Record that we've sampled this pair of source and sink nodes - sampled_rr_pairs.insert({src_rr, sink_rr}); - } - - VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name.c_str()); - VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of 
inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name.c_str()); - } -} - -bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) { - //Returns true if there is a directconnect between the two RR nodes - // - //This is checked by looking for a SOURCE -> OPIN -> IPIN -> SINK path - //which starts at src_rr_node and ends at sink_rr_node - auto& device_ctx = g_vpr_ctx.device(); - const auto& rr_graph = device_ctx.rr_graph; - - VTR_ASSERT(rr_graph.node_type(src_rr_node) == SOURCE && rr_graph.node_type(sink_rr_node) == SINK); - - //TODO: This is a constant depth search, but still may be too slow - for (t_edge_size i_src_edge = 0; i_src_edge < rr_graph.num_edges(src_rr_node); ++i_src_edge) { - RRNodeId opin_rr_node = rr_graph.edge_sink_node(src_rr_node, i_src_edge); - - if (rr_graph.node_type(opin_rr_node) != OPIN) continue; - - for (t_edge_size i_opin_edge = 0; i_opin_edge < rr_graph.num_edges(opin_rr_node); ++i_opin_edge) { - RRNodeId ipin_rr_node = rr_graph.edge_sink_node(opin_rr_node, i_opin_edge); - if (rr_graph.node_type(ipin_rr_node) != IPIN) continue; - - for (t_edge_size i_ipin_edge = 0; i_ipin_edge < rr_graph.num_edges(ipin_rr_node); ++i_ipin_edge) { - if (sink_rr_node == rr_graph.edge_sink_node(ipin_rr_node, i_ipin_edge)) { - return true; - } - } - } - } - return false; -} diff --git a/vpr/src/place/timing_place_lookup.h b/vpr/src/place/timing_place_lookup.h deleted file mode 100644 index fba3f470483..00000000000 --- a/vpr/src/place/timing_place_lookup.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef TIMING_PLACE_LOOKUP_H -#define TIMING_PLACE_LOOKUP_H -#include "place_delay_model.h" - -std::unique_ptr compute_place_delay_model(const t_placer_opts& placer_opts, - const t_router_opts& router_opts, - const Netlist<>& net_list, - t_det_routing_arch* det_routing_arch, - std::vector& segment_inf, - t_chan_width_dist chan_width_dist, - const std::vector& directs, - bool is_flat); - -std::vector get_best_classes(enum 
e_pin_type pintype, t_physical_tile_type_ptr type); - -bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node); - -#endif diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index 5feb0e9b2f6..f9c4c1d74a8 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -6,7 +6,6 @@ #include "route_tree.h" #include "rr_graph.h" #include "vtr_time.h" -#include "draw.h" RouterDelayProfiler::RouterDelayProfiler(const Netlist<>& net_list, const RouterLookahead* lookahead, diff --git a/vpr/src/util/vpr_utils.cpp b/vpr/src/util/vpr_utils.cpp index c2aa98286c0..430b386562f 100644 --- a/vpr/src/util/vpr_utils.cpp +++ b/vpr/src/util/vpr_utils.cpp @@ -708,7 +708,7 @@ InstPort parse_inst_port(const std::string& str) { VPR_FATAL_ERROR(VPR_ERROR_ARCH, "Failed to find block type named %s", inst_port.instance_name().c_str()); } - int num_pins = find_tile_port_by_name(blk_type, inst_port.port_name().c_str()).num_pins; + int num_pins = find_tile_port_by_name(blk_type, inst_port.port_name()).num_pins; if (num_pins == OPEN) { VPR_FATAL_ERROR(VPR_ERROR_ARCH, "Failed to find port %s on block type %s", inst_port.port_name().c_str(), inst_port.instance_name().c_str()); @@ -1857,6 +1857,33 @@ bool node_in_same_physical_tile(RRNodeId node_first, RRNodeId node_second) { } } +bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node) { + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + VTR_ASSERT(rr_graph.node_type(src_rr_node) == SOURCE && rr_graph.node_type(sink_rr_node) == SINK); + + // A direct connection is defined as a specific path: `SOURCE -> OPIN -> IPIN -> SINK`. 
+ //TODO: This is a constant depth search, but still may be too slow + for (t_edge_size i_src_edge = 0; i_src_edge < rr_graph.num_edges(src_rr_node); ++i_src_edge) { + RRNodeId opin_rr_node = rr_graph.edge_sink_node(src_rr_node, i_src_edge); + + if (rr_graph.node_type(opin_rr_node) != OPIN) continue; + + for (t_edge_size i_opin_edge = 0; i_opin_edge < rr_graph.num_edges(opin_rr_node); ++i_opin_edge) { + RRNodeId ipin_rr_node = rr_graph.edge_sink_node(opin_rr_node, i_opin_edge); + if (rr_graph.node_type(ipin_rr_node) != IPIN) continue; + + for (t_edge_size i_ipin_edge = 0; i_ipin_edge < rr_graph.num_edges(ipin_rr_node); ++i_ipin_edge) { + if (sink_rr_node == rr_graph.edge_sink_node(ipin_rr_node, i_ipin_edge)) { + return true; + } + } + } + } + return false; +} + std::vector get_cluster_netlist_intra_tile_classes_at_loc(int layer, int i, int j, diff --git a/vpr/src/util/vpr_utils.h b/vpr/src/util/vpr_utils.h index 8869cc55ddd..abaafadbfe7 100644 --- a/vpr/src/util/vpr_utils.h +++ b/vpr/src/util/vpr_utils.h @@ -264,9 +264,28 @@ RRNodeId get_class_rr_node_id(const RRSpatialLookup& rr_spatial_lookup, const int j, int class_physical_num); -// Check whether the given nodes are in the same cluster +/// @brief Check whether the given nodes are in the same cluster bool node_in_same_physical_tile(RRNodeId node_first, RRNodeId node_second); +/** + * @brief Checks if a direct connection exists between two RR nodes. + * + * A direct connection is defined as a specific path: `SOURCE -> OPIN -> IPIN -> SINK`. + * + * @param src_rr_node The source RR node (must be of type `SOURCE`). + * @param sink_rr_node The sink RR node (must be of type `SINK`). + * + * @return `true` if a direct connection exists between the source and sink nodes; + * otherwise, `false`. + * + * @details + * - The function performs a depth-limited search starting from the source node, + * traversing through OPIN, IPIN, and finally checking if the path reaches the sink node. 
+ * - Ensures the specified node types are respected (e.g., source node must be of type `SOURCE`). + */ + +bool directconnect_exists(RRNodeId src_rr_node, RRNodeId sink_rr_node); + std::vector get_cluster_netlist_intra_tile_classes_at_loc(int layer, int i, int j, diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp index a106ad80a80..2b584daedc3 100644 --- a/vpr/test/test_connection_router.cpp +++ b/vpr/test/test_connection_router.cpp @@ -8,7 +8,6 @@ #include "globals.h" #include "net_delay.h" #include "place_and_route.h" -#include "timing_place_lookup.h" static constexpr const char kArchFile[] = "../../vtr_flow/arch/timing/k6_frac_N10_mem32K_40nm.xml"; static constexpr int kMaxHops = 10; @@ -188,8 +187,7 @@ TEST_CASE("connection_router", "[vpr]") { // Clean up free_routing_structs(); - vpr_free_all(arch, - vpr_setup); + vpr_free_all(arch, vpr_setup); } } // namespace diff --git a/vpr/test/test_post_verilog.cpp b/vpr/test/test_post_verilog.cpp index a8344fa79d4..ca1a250b7d2 100644 --- a/vpr/test/test_post_verilog.cpp +++ b/vpr/test/test_post_verilog.cpp @@ -1,7 +1,7 @@ #include "catch2/catch_test_macros.hpp" #include "vpr_api.h" -#include "timing_place_lookup.h" +#include "router_delay_profiling.h" #include #include