The program 

Once we've done that, we can move the previous main() into a different function, leaving ourselves with a blank canvas for main() again. We're now ready for the meat of the program. What follows is a skeleton program; you're encouraged to actively change it as you follow along:

// main loads the tweets from dev.json, preprocesses them, plots the
// k-nearest-neighbour distances to KNNDist.png (the plot used to look for
// DBSCAN elbows), runs the three clustering helpers (dmm, kmeans, dbscan)
// and logs every cluster together with the cleaned text of its tweets.
func main() {
	f, err := os.Open("dev.json")
	dieIfErr(err)
	// Release the file handle once main returns.
	defer f.Close()

	tweets := load(f)
	p := newProcessor()
	tweets = p.process(tweets)

	expC := 20
	distances, last := knn(asMatrix(tweets), expC, clusters.EuclideanDistance)
	log.Printf("distances %v | %v", distances, last)

	// plot for DBSCAN elbows
	plt, err := plot.New()
	dieIfErr(err)
	// Both calls report failures through their error result; surface them
	// instead of continuing with a missing or partial plot.
	dieIfErr(plotutil.AddLinePoints(plt, "KNN Distance", plotKNNDist(last)))
	dieIfErr(plt.Save(25*vg.Centimeter, 25*vg.Centimeter, "KNNDist.png"))

	// actually do the clustering
	dmmClust := dmm(tweets, expC, p.corpus.Size())
	kmeansClust := kmeans(tweets, expC)
	dbscanClust, clustCount := dbscan(tweets)

	// print output
	log.Printf("len(tweets)%d", len(tweets))
	var buf bytes.Buffer

	// writeGroups appends one section per cluster: a header with the cluster
	// index and size, followed by the cleaned text of every member tweet.
	writeGroups := func(groups [][]int) {
		for i, members := range groups {
			fmt.Fprintf(&buf, "CLUSTER %d: %d ", i, len(members))
			for _, idx := range members {
				fmt.Fprintf(&buf, " %v ", tweets[idx].clean2)
			}
		}
	}

	lc, tweetCount := largestCluster2(dmmClust)
	fmt.Fprintf(&buf, "Largest Cluster %d - %d tweets ", lc, tweetCount)
	writeGroups(byClusters2(dmmClust, expC))

	fmt.Fprintf(&buf, "============== ")
	writeGroups(byClusters(kmeansClust, expC))

	fmt.Fprintf(&buf, "============== ")
	writeGroups(byClusters(dbscanClust, clustCount))

	log.Println(buf.String())
}

There are some utility functions that I have yet to show you. Now it's time to define them:

 // dmm converts the tweets to documents with toDocs and clusters them with
// dmmclust.FindClusters, asking for expC clusters over a vocabulary of
// corpusSize. Any error from the clusterer is handed to dieIfErr.
func dmm(a []*processedTweet, expC int, corpusSize int) []dmmclust.Cluster {
	// The seed is a fixed constant, so the sampler's random source yields the
	// same sequence on every run.
	rng := rand.New(rand.NewSource(1337))

	cfg := dmmclust.Config{
		K:          expC,
		Vocabulary: corpusSize,
		Iter:       1000,
		Alpha:      0.0,
		Beta:       0.01,
		Score:      dmmclust.Algorithm4,
		Sampler:    dmmclust.NewGibbs(rng),
	}

	found, err := dmmclust.FindClusters(toDocs(a), cfg)
	dieIfErr(err)
	return found
}
// kmeans builds a clusters.KMeans clusterer for expC clusters using Euclidean
// distance, trains it on the matrix form of the tweets, and returns the
// clusterer's guesses. Errors are handed to dieIfErr.
func kmeans(a []*processedTweet, expC int) []int {
	clusterer, err := clusters.KMeans(100000, expC, clusters.EuclideanDistance)
	dieIfErr(err)

	err = clusterer.Learn(asMatrix(a))
	dieIfErr(err)

	return clusterer.Guesses()
}
// dbscan builds a clusters.DBSCAN clusterer using Euclidean distance, trains
// it on the matrix form of the tweets, and returns the clusterer's guesses
// along with the number of distinct values that occur in those guesses.
// Errors are handed to dieIfErr.
func dbscan(a []*processedTweet) ([]int, int) {
	clusterer, err := clusters.DBSCAN(5, 0.965, 8, clusters.EuclideanDistance)
	dieIfErr(err)
	dieIfErr(clusterer.Learn(asMatrix(a)))

	labels := clusterer.Guesses()

	// Use a set to count how many different label values appear. Every value
	// is counted, whatever it means to the clusterer.
	distinct := make(map[int]struct{})
	for i := range labels {
		distinct[labels[i]] = struct{}{}
	}
	return labels, len(distinct)
}
// largestCluster returns the cluster id that occurs most often in clusters
// together with its number of occurrences.
//
// When several ids share the highest count, the smallest id is returned, so
// the result does not depend on Go's randomised map iteration order.
// For an empty input it returns (0, 0).
func largestCluster(clusters []int) (int, int) {
	counts := make(map[int]int)
	for _, id := range clusters {
		counts[id]++
	}

	var bestID, bestCount int
	for id, n := range counts {
		// Every stored count is at least 1, so the first entry visited always
		// replaces the zero-valued starting point.
		if n > bestCount || (n == bestCount && id < bestID) {
			bestID, bestCount = id, n
		}
	}
	return bestID, bestCount
}
// largestCluster2 returns the cluster id (as reported by ID()) that occurs
// most often in clusters together with its number of occurrences.
//
// When several ids share the highest count, the smallest id is returned, so
// the result does not depend on Go's randomised map iteration order.
// For an empty input it returns (0, 0).
func largestCluster2(clusters []dmmclust.Cluster) (int, int) {
	counts := make(map[int]int)
	for _, c := range clusters {
		counts[c.ID()]++
	}

	var bestID, bestCount int
	for id, n := range counts {
		// Every stored count is at least 1, so the first entry visited always
		// replaces the zero-valued starting point.
		if n > bestCount || (n == bestCount && id < bestID) {
			bestID, bestCount = id, n
		}
	}
	return bestID, bestCount
}
// byClusters groups the indices of a by their label. Labels are treated as
// 1-based: label v is stored in retVal[v-1]. Entries labelled -1 are skipped.
// It returns nil when expectedClusters is 0; otherwise the result has exactly
// expectedClusters groups, some of which may be empty.
//
// If a label falls outside the expected range, the resulting panic is logged
// together with expectedClusters and the offending label, then re-raised.
func byClusters(a []int, expectedClusters int) (retVal [][]int) {
	if expectedClusters == 0 {
		return nil
	}
	retVal = make([][]int, expectedClusters)

	// label lives outside the loop so the deferred handler can report the
	// value that was being processed when the panic happened.
	label := 0
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		log.Printf("exp %v | %v", expectedClusters, label)
		panic(r)
	}()

	for idx := range a {
		label = a[idx]
		if label == -1 {
			continue
		}
		slot := label - 1
		retVal[slot] = append(retVal[slot], idx)
	}
	return retVal
}
// byClusters2 groups the indices of a by the value each element's ID()
// reports, which is used directly as the index into the result. The result
// has exactly expectedClusters groups, some of which may be empty.
func byClusters2(a []dmmclust.Cluster, expectedClusters int) (retVal [][]int) {
	retVal = make([][]int, expectedClusters)
	for idx := range a {
		id := a[idx].ID()
		retVal[id] = append(retVal[id], idx)
	}
	return retVal
}

These are some of the utility functions that may be found in utils.go. They mainly help with tweaking the program. Now run the program by typing go run *.go.

..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.14.141.115