[thinlto] Basic thinlto fdo heuristic

Summary:
This patch improves thinlto importer
by importing 3x larger functions that are called from hot block.

I compared performance with the trunk on spec, and there
were about 2% on povray and 3.33% on milc. These results seems
to be consistant and match the results Teresa got with her simple
heuristic. Some benchmarks got slower but I think they are just
noisy (mcf, xalancbmki, omnetpp)- running the benchmarks again with
more iterations to confirm. Geomean of all benchmarks including the noisy ones
were about +0.02%.

I see much better improvement on google branch with Easwaran patch
for pgo callsite inlining (the inliner actually inline those big functions)
Over all I see +0.5% improvement, and I get +8.65% on povray.
So I guess we will see much bigger change when Easwaran patch will land
(it depends on new pass manager), but it is still worth putting this to trunk
before it.

Implementation details changes:
- Removed CallsiteCount.
- ProfileCount got replaced by Hotness
- hot-import-multiplier is set to 3.0 for now,
didn't have time to tune it up, but I see that we get most of the interesting
functions with 3, so there is no much performance difference with higher, and
binary size doesn't grow as much as with 10.0.

Reviewers: eraman, mehdi_amini, tejohnson

Subscribers: mehdi_amini, llvm-commits

Differential Revision: https://reviews.llvm.org/D24638

llvm-svn: 282437
This commit is contained in:
Piotr Padlewski
2016-09-26 20:37:32 +00:00
parent 2cd84c905d
commit d9830eb79f
21 changed files with 422 additions and 80 deletions

View File

@@ -3293,10 +3293,8 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
bool HasProfileData = F.getEntryCount().hasValue();
for (auto &ECI : Calls) {
NameVals.push_back(getValueId(ECI.first));
assert(ECI.second.CallsiteCount > 0 && "Expected at least one callsite");
NameVals.push_back(ECI.second.CallsiteCount);
if (HasProfileData)
NameVals.push_back(ECI.second.ProfileCount);
NameVals.push_back(static_cast<uint8_t>(ECI.second.Hotness));
}
unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev);
@@ -3336,7 +3334,7 @@ void ModuleBitcodeWriter::writeModuleLevelReferences(
// Current version for the summary.
// This is bumped whenever we introduce changes in the way some record are
// interpreted, like flags for instance.
static const uint64_t INDEX_VERSION = 1;
static const uint64_t INDEX_VERSION = 2;
/// Emit the per-module summary section alongside the rest of
/// the module's bitcode.
@@ -3357,7 +3355,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
// numrefs x valueid, n x (valueid, callsitecount)
// numrefs x valueid, n x (valueid)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv);
@@ -3369,7 +3367,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
// numrefs x valueid, n x (valueid, callsitecount, profilecount)
// numrefs x valueid, n x (valueid, hotness)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv);
@@ -3442,7 +3440,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
// numrefs x valueid, n x (valueid, callsitecount)
// numrefs x valueid, n x (valueid)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv);
@@ -3455,7 +3453,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
// numrefs x valueid, n x (valueid, callsitecount, profilecount)
// numrefs x valueid, n x (valueid, hotness)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv);
@@ -3542,7 +3540,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
bool HasProfileData = false;
for (auto &EI : FS->calls()) {
HasProfileData |= EI.second.ProfileCount != 0;
HasProfileData |= EI.second.Hotness != CalleeInfo::HotnessType::Unknown;
if (HasProfileData)
break;
}
@@ -3553,10 +3551,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
if (!hasValueId(EI.first.getGUID()))
continue;
NameVals.push_back(getValueId(EI.first.getGUID()));
assert(EI.second.CallsiteCount > 0 && "Expected at least one callsite");
NameVals.push_back(EI.second.CallsiteCount);
if (HasProfileData)
NameVals.push_back(EI.second.ProfileCount);
NameVals.push_back(static_cast<uint8_t>(EI.second.Hotness));
}
unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev);